Coverage for src/wiktextract/extractor/vi/page.py: 38%

165 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import string 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 HTMLNode, 

8 LevelNode, 

9 NodeKind, 

10 TemplateNode, 

11 WikiNode, 

12) 

13 

14from ...page import clean_node 

15from ...wxr_context import WiktextractContext 

16from .descendant import extract_descendant_section 

17from .etymology import extract_etymology_section 

18from .linkage import extract_alt_form_section, extract_linkage_section 

19from .models import Form, Sense, WordEntry 

20from .pos import extract_note_section, extract_pos_section 

21from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS 

22from .sound import extract_homophone_section, extract_sound_section 

23from .tags import translate_raw_tags 

24from .translation import extract_translation_section 

25 

26 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section of a language entry to the matching extractor.

    The section title decides which extractor runs; after the current
    level is handled, category links are collected and child sections are
    processed recursively.
    """
    subtitle = clean_node(wxr, None, level_node.largs)
    # Titles may carry a trailing counter (e.g. "Danh từ 2"); strip digits
    # and whitespace so lookups against the title tables match.
    subtitle = subtitle.rstrip(string.digits + string.whitespace)
    if subtitle in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, subtitle)
        # Some titles are both POS and linkage titles. If the POS extractor
        # produced no senses, treat the section as linkage instead: drop the
        # empty entry and re-parse the same node as a linkage section.
        if len(page_data[-1].senses) == 0 and subtitle in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_SECTIONS[subtitle],
            )
    elif subtitle in TRANSLATION_SECTIONS:
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) else base_data, level_node
        )
    elif subtitle == "Cách phát âm":  # pronunciation
        extract_sound_section(wxr, base_data, level_node)
    elif subtitle == "Từ đồng âm":  # homophones
        extract_homophone_section(wxr, base_data, level_node)
    elif subtitle == "Từ nguyên":  # etymology
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            # The etymology has its own subsections, so it only applies to
            # the entries created beneath it; copy base_data so later
            # sibling sections don't inherit this etymology.
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif subtitle == "Cách viết khác":  # alternative spellings
        extract_alt_form_section(wxr, base_data, page_data, level_node)
    elif subtitle in ["Ghi chú sử dụng", "Chú ý"]:  # usage notes
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif subtitle in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data if len(page_data) > 0 else [base_data],
            level_node,
            LINKAGE_SECTIONS[subtitle],
        )
    elif subtitle == "Hậu duệ":  # descendants
        extract_descendant_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif subtitle not in ["Tham khảo", "Cách ra dấu", "Đọc thêm", "Xem thêm"]:
        # Anything not in the deliberately ignored list (references,
        # sign production, further reading, see also) is logged for review.
        wxr.wtp.debug(f"Unknown title: {subtitle}", sortid="vi/page/22")

    extract_section_cats(wxr, base_data, page_data, level_node)
    # Recurse into child level nodes regardless of which branch ran above.
    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

80 

81 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Vietnamese Wiktionary page into word-entry dicts.

    Each level-2 section is a language entry; its title is the language
    name. Returns a list of ``WordEntry`` models dumped to plain dicts.
    """
    # page layout
    # https://vi.wiktionary.org/wiki/Wiktionary:Sơ_đồ_mục_từ

    # ignore thesaurus, rhyme, quote, reconstruct pages
    if page_title.startswith(
        ("Kho từ vựng:", "Vần:", "Kho ngữ liệu:", "Từ tái tạo:")
    ):
        return []

    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_name = clean_node(wxr, categories, level2_node.largs) or "unknown"
        lang_code = name_to_code(lang_name, "vi") or "unknown"
        # A "langname" template in the heading overrides the code derived
        # from the language name.
        for t_node in level2_node.find_content(NodeKind.TEMPLATE):
            if t_node.template_name == "langname":
                lang_code = clean_node(
                    wxr, None, t_node.template_parameters.get(1, "")
                )
        # Skip languages not requested by the extraction config.
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = categories.get("categories", [])
        extract_section_cats(wxr, base_data, page_data, level2_node)
        # Handle Chinese/Japanese form and soft-redirect templates placed
        # directly under the level-2 heading.
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name in ["zho-forms", "zh-forms"]:
                extract_zh_forms_template(wxr, base_data, t_node)
            elif t_node.template_name in ["zh-see", "zho-see"]:
                base_data.redirects.append(
                    clean_node(wxr, None, t_node.template_parameters.get(1, ""))
                )
                clean_node(wxr, base_data, t_node)
            elif t_node.template_name in ["ja-see", "jpn-see", "ja-see-kango"]:
                # All positional arguments are redirect targets.
                for key, value in t_node.template_parameters.items():
                    if isinstance(key, int):
                        base_data.redirects.append(clean_node(wxr, None, value))
                clean_node(wxr, base_data, t_node)
        if len(base_data.redirects) > 0:
            # Redirect-only entries would otherwise never be appended,
            # since only POS sections create entries in page_data.
            page_data.append(base_data)
        for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level)

    for data in page_data:
        if len(data.senses) == 0:
            # Entries without glosses (e.g. redirects) get a placeholder
            # sense so consumers can detect them.
            data.senses.append(Sense(tags=["no-gloss"]))

    return [d.model_dump(exclude_defaults=True) for d in page_data]

143 

144 

def extract_section_cats(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
    """Collect category links/templates from a section and attach them.

    Categories go to every entry of the current language when such
    entries already exist in ``page_data``; otherwise they are stored on
    ``base_data`` for entries created later.
    """
    cat_templates = ("topics", "C", "topic", "catlangname", "cln")
    cat_data = {}
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        is_link = child.kind == NodeKind.LINK
        if is_link or child.template_name in cat_templates:
            clean_node(wxr, cat_data, child)

    found_cats = cat_data.get("categories", [])
    if len(page_data) > 0 and page_data[-1].lang_code == base_data.lang_code:
        last_lang = page_data[-1].lang_code
        for entry in page_data:
            if entry.lang_code == last_lang:
                entry.categories.extend(found_cats)
    else:
        base_data.categories.extend(found_cats)

170 

171 

def extract_zh_forms_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract word forms from a "zh-forms"/"zho-forms" template.

    The template is expanded to wiki tables; header cells provide the
    row label and tags, data cells provide the forms themselves.
    """
    lit_arg = t_node.template_parameters.get("lit", "")
    base_data.literal_meaning = clean_node(wxr, None, lit_arg)
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            header_text = ""
            header_tags = []
            skip_data_cells = False
            for cell_node in row_node.find_child(cell_kinds):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text, header_tags, skip_data_cells = (
                        extract_zh_forms_header_cell(wxr, base_data, cell_node)
                    )
                elif not skip_data_cells:
                    # When the header cell itself carried form spans,
                    # the row's data cells are skipped.
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell_node, header_text, header_tags
                    )
    # Pick up category links emitted by the expanded template.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)

199 

200 

def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Process a header cell of a "zh-forms" table row.

    Returns the row header text, the raw tags split from it, and whether
    the cell contains HTML "span" nodes (in that case the header cell
    carries the forms itself and the caller skips the row's data cells).
    """
    span_indices = [
        idx for idx, _ in header_cell.find_html("span", with_index=True)
    ]
    header_has_span = len(span_indices) > 0
    first_span_index = (
        min(span_indices) if header_has_span else len(header_cell.children)
    )
    # The header text is everything before the first span.
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    row_header_tags = [
        part.strip() for part in row_header.split(" và ") if part.strip() != ""
    ]
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        sup_title = ""
        form_nodes = []
        for child in span_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "sup":
                # Superscript note; its inner span's "title" attribute
                # becomes an extra raw tag on the forms of this span.
                for sup_span in child.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(child)
        if span_lang not in ["zh-Hant", "zh-Hans"]:
            continue
        for word in clean_node(wxr, None, form_nodes).split("/"):
            if word in ["", base_data.word]:
                continue
            form = Form(form=word, raw_tags=row_header_tags)
            if sup_title != "":
                form.raw_tags.append(sup_title)
            translate_raw_tags(form)
            base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span

236 

237 

def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Process a data cell of a "zh-forms" table row and add its forms."""
    for outer_span in cell.find_html("span"):
        cell_forms = []
        for inner_span in outer_span.find_html("span"):
            lang_attr = inner_span.attrs.get("lang", "")
            if lang_attr in ["zh-Hant", "zh-Hans", "zh"]:
                word = clean_node(wxr, None, inner_span)
                if word in ["", "/", base_data.word]:
                    continue
                form = Form(form=word, raw_tags=row_header_tags)
                if lang_attr == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif lang_attr == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                cell_forms.append(form)
            elif "font-size:80%" in inner_span.attrs.get("style", ""):
                # Small text is a label applying to the forms already
                # collected from this outer span.
                raw_tag = clean_node(wxr, None, inner_span)
                if raw_tag != "":
                    for form in cell_forms:
                        form.raw_tags.append(raw_tag)
                        translate_raw_tags(form)
        base_data.forms.extend(cell_forms)