Coverage for src/wiktextract/extractor/ms/translation.py: 57%

96 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from wikitextprocessor.parser import ( 

2 LEVEL_KIND_FLAGS, 

3 LevelNode, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .models import Translation, WordEntry 

12from .tags import translate_raw_tags 

13 

14 

def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    source: str = "",
    from_trans_see: bool = False,
) -> None:
    """Collect the translations under ``level_node`` and store them.

    The children of ``level_node`` are scanned in order:

    * ``ter-atas`` / ``teratas`` / ``trans-top`` templates replace the
      current sense with their cleaned text, unless this call was made
      with ``from_trans_see`` set and already carries a non-empty sense.
    * Lists are parsed item by item with
      ``extract_translation_list_item``.
    * ``ter-lihat`` / ``trans-see`` templates are handed to
      ``extract_trans_see_template``; they are ignored when
      ``from_trans_see`` is true.

    Afterwards the gathered categories and every translation whose
    ``word`` is non-empty are added to:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different language code than ``base_data``;
    * otherwise, for a level-3 section, every entry of ``page_data`` that
      shares the last entry's language code;
    * otherwise only the last entry of ``page_data``.
    """
    top_templates = ("ter-atas", "teratas", "trans-top")
    see_templates = ("ter-lihat", "trans-see")
    translations = []
    category_data = {}
    for child in level_node.children:
        is_template = isinstance(child, TemplateNode)
        if (
            is_template
            and child.template_name in top_templates
            and (sense == "" or not from_trans_see)
        ):
            sense = clean_node(wxr, category_data, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                translations.extend(
                    extract_translation_list_item(wxr, item, sense, source)
                )
        elif (
            is_template
            and child.template_name in see_templates
            and not from_trans_see
        ):
            extract_trans_see_template(wxr, page_data, base_data, child)

    # Decide which word entries receive the collected data.
    if not page_data or page_data[-1].lang_code != base_data.lang_code:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        last_lang_code = page_data[-1].lang_code
        targets = [
            entry for entry in page_data if entry.lang_code == last_lang_code
        ]
    else:
        targets = [page_data[-1]]

    section_categories = category_data.get("categories", [])
    for entry in targets:
        entry.categories.extend(section_categories)
        for translation in translations:
            if translation.word != "":
                entry.translations.append(translation)
                entry.categories.extend(translation.categories)

70 

71 

def extract_translation_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str, source: str
) -> list[Translation]:
    """Extract every translation contained in one list item.

    The children of ``list_item`` are processed in order:

    * The first text node ending with ``:`` provides the language name
      (``"unknown"`` until one is found, or if it is empty after cleanup).
    * ``t`` / ``trad`` / ``tø`` / ``t-`` / ``t+`` templates each yield one
      translation via ``extract_t_template``.
    * Qualifier templates (``penerang``, ``qualifier``, ``i``, ``q``,
      ``qual``) add a raw tag to the most recent translation; they are
      skipped when no translation has been collected yet.
    * Nested lists are processed recursively; each nested item starts
      again with the language name ``"unknown"``.

    Returns the translations in document order, including those whose
    ``word`` is empty.
    """
    tr_list = []
    lang_name = "unknown"
    for node in list_item.children:
        if (
            isinstance(node, str)
            and node.strip().endswith(":")
            and lang_name == "unknown"
        ):
            # Strip surrounding whitespace first so that text such as
            # "Cina:\n" loses its colon as well; the check above already
            # ignores that whitespace.
            lang_name = node.strip().strip(": ") or "unknown"
        elif isinstance(node, TemplateNode) and node.template_name in (
            "t",
            "trad",
            "tø",
            "t-",
            "t+",
        ):
            tr_list.append(
                extract_t_template(wxr, node, sense, lang_name, source)
            )
        elif (
            isinstance(node, TemplateNode)
            and node.template_name
            in ("penerang", "qualifier", "i", "q", "qual")
            and len(tr_list) > 0
        ):
            raw_tag = clean_node(wxr, None, node).strip("() ")
            if raw_tag != "":
                tr_list[-1].raw_tags.append(raw_tag)
                translate_raw_tags(tr_list[-1])
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                tr_list.extend(
                    extract_translation_list_item(
                        wxr, child_list_item, sense, source
                    )
                )
    return tr_list

112 

113 

def extract_t_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    lang_name: str,
    source: str,
) -> Translation:
    """Build a ``Translation`` from a single translation template.

    The language code is the cleaned first template parameter, or
    ``"unknown"`` when that is empty. The template is expanded and its
    ``<span>`` elements are inspected:

    * the first span whose ``lang`` attribute equals the language code
      supplies the translated word;
    * a span whose class is exactly ``gender`` supplies raw tags from its
      ``<abbr>`` children (empty, ``?`` and ``jantina tidak diberi``
      values are dropped);
    * a span whose class contains ``tr`` supplies the romanization.

    When a word was found, raw tags are translated and the top-level links
    of the expansion are cleaned with the translation as the category
    target. The translation is returned even if its word is empty.
    """
    first_arg = t_node.template_parameters.get(1, "")
    lang_code = clean_node(wxr, None, first_arg) or "unknown"
    translation = Translation(
        word="", lang=lang_name, lang_code=lang_code, sense=sense, source=source
    )
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    ignored_gender_tags = ("", "?", "jantina tidak diberi")

    for span in expanded.find_html("span"):
        css_class = span.attrs.get("class", "")
        if span.attrs.get("lang") == lang_code and translation.word == "":
            translation.word = clean_node(wxr, None, span)
        elif css_class == "gender":
            for abbr in span.find_html("abbr"):
                gender_tag = clean_node(wxr, None, abbr)
                if gender_tag not in ignored_gender_tags:
                    translation.raw_tags.append(gender_tag)
        elif "tr" in css_class:
            translation.roman = clean_node(wxr, None, span)

    if translation.word == "":
        return translation

    translate_raw_tags(translation)
    for link in expanded.find_child(NodeKind.LINK):
        # Cleaning with the translation as target records link categories.
        clean_node(wxr, translation, link)
    return translation

146 

147 

def extract_trans_see_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Follow a ``ter-lihat`` / ``trans-see`` template to other pages.

    Parameter 1 is the sense. Parameters 2 to 10 (consecutive, stopping at
    the first missing index) name the pages holding the translations; when
    parameter 2 is absent the sense text itself is used as the page title.

    For each title, any ``#fragment`` is removed, the page is loaded and
    its "Terjemahan" section is passed to ``extract_translation_section``
    with ``from_trans_see=True`` and the page title as ``source``. Titles
    whose page or section cannot be found are skipped, so the remaining
    titles are still processed.
    """
    # https://ms.wiktionary.org/wiki/Templat:ter-lihat
    sense = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    page_titles = []
    if 2 in t_node.template_parameters:
        for index in range(2, 11):
            if index not in t_node.template_parameters:
                break
            page_titles.append(
                clean_node(wxr, None, t_node.template_parameters[index])
            )
    else:
        page_titles.append(sense)

    for page_title in page_titles:
        page_title = page_title.partition("#")[0]
        page = wxr.wtp.get_page(page_title)
        # NOTE(review): the body guard assumes a page (e.g. a redirect)
        # may carry no body text - confirm against wikitextprocessor.
        if page is None or page.body is None:
            # Skip only this title; later titles may still resolve.
            continue
        root = wxr.wtp.parse(page.body)
        target_node = find_subpage_section(wxr, root, "Terjemahan")
        if target_node is not None:
            extract_translation_section(
                wxr,
                page_data,
                base_data,
                target_node,
                sense=sense,
                source=page_title,
                from_trans_see=True,
            )

186 

187 

def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_section: str
) -> WikiNode | None:
    """Return the first heading node under ``root`` titled ``target_section``.

    Heading nodes are searched recursively; a node matches when the cleaned
    text of its ``largs`` equals ``target_section`` exactly. ``None`` is
    returned when no heading matches.
    """
    matching_sections = (
        heading
        for heading in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, heading.largs) == target_section
    )
    # The generator is lazy, so the search stops at the first match.
    return next(matching_sections, None)