Coverage for src/wiktextract/extractor/vi/page.py: 40%

167 statements  

coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

import string
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section, extract_ja_kanjitab_template
from .linkage import extract_alt_form_section, extract_linkage_section
from .models import Form, Sense, WordEntry
from .pos import extract_note_section, extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
from .sound import extract_homophone_section, extract_sound_section
from .tags import translate_raw_tags
from .translation import extract_translation_section


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
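    """Dispatch one subsection to the matching extractor.

    The section heading is cleaned and stripped of trailing digits
    (e.g. a numbered heading like "Từ nguyên 2") before it is compared
    against the known POS, linkage, and translation section titles.
    """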

    subtitle = clean_node(wxr, None, level_node.largs)
    subtitle = subtitle.rstrip(string.digits + string.whitespace)
    if subtitle in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, subtitle)
        if len(page_data[-1].senses) == 0 and subtitle in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_SECTIONS[subtitle],
            )
    elif subtitle in TRANSLATION_SECTIONS:
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) else base_data, level_node
        )
    elif subtitle == "Cách phát âm":  # "Pronunciation"
        extract_sound_section(wxr, base_data, level_node)
    elif subtitle == "Từ đồng âm":  # "Homophones"
        extract_homophone_section(wxr, base_data, level_node)
    elif subtitle == "Từ nguyên":  # "Etymology"
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif subtitle == "Cách viết khác":  # "Alternative spellings"
        extract_alt_form_section(wxr, base_data, page_data, level_node)
    elif subtitle in ["Ghi chú sử dụng", "Chú ý"]:  # "Usage notes", "Note"
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif subtitle in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data if len(page_data) > 0 else [base_data],
            level_node,
            LINKAGE_SECTIONS[subtitle],
        )
    elif subtitle == "Hậu duệ":  # "Descendants"
        extract_descendant_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    # "Tham khảo" = References, "Cách ra dấu" = How to sign,
    # "Đọc thêm" = Further reading, "Xem thêm" = See also
    elif subtitle not in ["Tham khảo", "Cách ra dấu", "Đọc thêm", "Xem thêm"]:
        wxr.wtp.debug(f"Unknown title: {subtitle}", sortid="vi/page/22")

    extract_section_cats(wxr, base_data, page_data, level_node)
    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
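    """Parse one Vietnamese Wiktionary page into word entries.

    Each level-2 heading is a language section; its subsections are
    handled recursively by parse_section().
    """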

    # page layout:
    # https://vi.wiktionary.org/wiki/Wiktionary:Sơ_đồ_mục_từ

    # ignore thesaurus, rhyme, quote, reconstruct pages
    if page_title.startswith(
        ("Kho từ vựng:", "Vần:", "Kho ngữ liệu:", "Từ tái tạo:")
    ):
        return []

    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_name = clean_node(wxr, categories, level2_node.largs) or "unknown"
        lang_code = name_to_code(lang_name, "vi") or "unknown"
        for t_node in level2_node.find_content(NodeKind.TEMPLATE):
            if t_node.template_name == "langname":
                lang_code = clean_node(
                    wxr, None, t_node.template_parameters.get(1, "")
                )
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = categories.get("categories", [])
        extract_section_cats(wxr, base_data, page_data, level2_node)
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name in ["zho-forms", "zh-forms"]:
                extract_zh_forms_template(wxr, base_data, t_node)
            elif t_node.template_name in ["zh-see", "zho-see"]:
                base_data.redirects.append(
                    clean_node(wxr, None, t_node.template_parameters.get(1, ""))
                )
                clean_node(wxr, base_data, t_node)
            elif t_node.template_name in ["ja-see", "jpn-see", "ja-see-kango"]:
                for key, value in t_node.template_parameters.items():
                    if isinstance(key, int):
                        base_data.redirects.append(clean_node(wxr, None, value))
                clean_node(wxr, base_data, t_node)
            elif (
                t_node.template_name.endswith("-kanjitab")
                or t_node.template_name == "ja-kt"
            ):
                extract_ja_kanjitab_template(wxr, t_node, base_data)

        if len(base_data.redirects) > 0:
            page_data.append(base_data)
        for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))

    return [d.model_dump(exclude_defaults=True) for d in page_data]

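# A minimal usage sketch (hypothetical wiring; the exact constructor
# arguments of Wtp, WiktionaryConfig, and WiktextractContext depend on
# the installed wikitextprocessor/wiktextract versions):
#
#     from wikitextprocessor import Wtp
#     from wiktextract.config import WiktionaryConfig
#     from wiktextract.wxr_context import WiktextractContext
#
#     wxr = WiktextractContext(Wtp(lang_code="vi"), WiktionaryConfig())
#     entries = parse_page(wxr, "nước", page_wikitext)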

def extract_section_cats(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
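    """Collect category links and category templates under a section.

    Categories go to base_data when no entry for this language exists
    yet; otherwise they are copied onto every entry of that language.
    """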

    cats = {}
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        if node.kind == NodeKind.LINK:
            clean_node(wxr, cats, node)
        elif node.template_name in [
            "topics",
            "C",
            "topic",
            "catlangname",
            "cln",
        ]:
            clean_node(wxr, cats, node)

    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        base_data.categories.extend(cats.get("categories", []))
    else:
        for data in page_data:
            if data.lang_code == page_data[-1].lang_code:
                data.categories.extend(cats.get("categories", []))


def extract_zh_forms_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
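    """Extract Chinese word forms from a zh-forms style template.

    The template is expanded and its table rows are walked: when a
    header cell carries the forms itself (in language-tagged spans),
    the data cells of that row are skipped.
    """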

    # the "lit" parameter holds the word's literal meaning
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_header_tags = []
            header_has_span = False
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header, row_header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not header_has_span:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, row_header, row_header_tags
                    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)


def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
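    """Parse one header cell of a zh-forms table row.

    Text before the first <span> becomes the row header; "zh-Hant" and
    "zh-Hans" spans inside the cell are recorded directly as forms.
    Returns (row_header, row_header_tags, header_has_span).
    """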

    row_header = ""
    row_header_tags = []
    header_has_span = False
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
        header_has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    for raw_tag in row_header.split(" và "):  # "và" = "and"
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                # the <sup> tooltip text becomes an extra raw tag
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span


def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
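    """Parse one data cell of a zh-forms table row.

    "zh-Hant"/"zh-Hans"/"zh" spans yield forms; small-print spans
    (font-size:80%) carry raw tags that apply to the forms collected
    so far from the same cell.
    """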

251 for top_span_tag in cell.find_html("span"): 

252 forms = [] 

253 for span_tag in top_span_tag.find_html("span"): 

254 span_lang = span_tag.attrs.get("lang", "") 

255 if span_lang in ["zh-Hant", "zh-Hans", "zh"]: 

256 word = clean_node(wxr, None, span_tag) 

257 if word not in ["", "/", base_data.word]: 

258 form = Form(form=word, raw_tags=row_header_tags) 

259 if span_lang == "zh-Hant": 

260 form.tags.append("Traditional-Chinese") 

261 elif span_lang == "zh-Hans": 

262 form.tags.append("Simplified-Chinese") 

263 translate_raw_tags(form) 

264 forms.append(form) 

265 elif "font-size:80%" in span_tag.attrs.get("style", ""): 

266 raw_tag = clean_node(wxr, None, span_tag) 

267 if raw_tag != "": 

268 for form in forms: 

269 form.raw_tags.append(raw_tag) 

270 translate_raw_tags(form) 

271 base_data.forms.extend(forms)