Coverage for src/wiktextract/extractor/zh/page.py: 80%

182 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-09 23:59 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 LevelNode, 

8 NodeKind, 

9 TemplateNode, 

10 WikiNode, 

11) 

12 

13from ...page import clean_node 

14from ...wxr_context import WiktextractContext 

15from ...wxr_logging import logger 

16from .descendant import extract_descendant_section 

17from .etymology import extract_etymology_section 

18from .gloss import extract_gloss 

19from .headword_line import extract_pos_head_line_nodes 

20from .inflection import extract_inflections 

21from .linkage import extract_linkage_section 

22from .models import Form, Sense, WordEntry 

23from .note import extract_note_section 

24from .pronunciation import extract_pronunciation_section 

25from .section_titles import ( 

26 DESCENDANTS_TITLES, 

27 ETYMOLOGY_TITLES, 

28 IGNORED_TITLES, 

29 INFLECTION_TITLES, 

30 LINKAGE_TITLES, 

31 POS_TITLES, 

32 PRONUNCIATION_TITLES, 

33 TRANSLATIONS_TITLES, 

34 USAGE_NOTE_TITLES, 

35) 

36from .translation import extract_translation 

37 

38 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section to its extractor, then recurse into subsections.

    The section title is cleaned, stripped of a trailing numeric or
    parenthesised suffix, and looked up in the title tables imported from
    ``section_titles``.  Extracted data is written into ``page_data`` (or
    into ``base_data`` when ``page_data`` is still empty).

    Args:
        wxr: Extraction context; its ``config.capture_*`` flags gate most
            of the branches below.
        page_data: Word entries collected so far for the page; mutated in
            place.
        base_data: Template entry for the current language section.  It is
            deep-copied before etymology/pronunciation extraction when the
            section has nested level nodes, and that copy is what nested
            sections receive.
        level_node: The section node to process.
    """
    subtitle = clean_node(wxr, None, level_node.largs)
    # Remove a trailing number or parenthesised suffix from the subtitle.
    # The parentheses must be matched literally (full-width or ASCII): as an
    # unescaped group, "(.+)" matches the whole subtitle and blanks it.
    subtitle = re.sub(r"\s*(?:（.+）|\(.+\)|\d+)$", "", subtitle)
    wxr.wtp.start_subsection(subtitle)
    if subtitle in IGNORED_TITLES:
        pass
    elif subtitle in POS_TITLES:
        process_pos_block(wxr, page_data, base_data, level_node, subtitle)
        if not page_data[-1].senses and subtitle in LINKAGE_TITLES:
            # The title is both a POS title and a linkage title, and no
            # sense was found: drop the entry that was just added and read
            # the section as a linkage list instead.
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data or [base_data],
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif wxr.config.capture_etymologies and subtitle.startswith(
        tuple(ETYMOLOGY_TITLES)
    ):
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            # Work on a copy so the extractor does not write into the
            # caller's base_data object.
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif wxr.config.capture_pronunciation and subtitle in PRONUNCIATION_TITLES:
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_pronunciation_section(wxr, base_data, level_node)
    elif wxr.config.capture_linkages and subtitle in LINKAGE_TITLES:
        # A title listed in both tables is treated as a descendant section
        # only when it actually contains one of the descendant templates.
        is_descendant_section = subtitle in DESCENDANTS_TITLES and any(
            t_node.template_name.lower()
            in ("desc", "descendant", "desctree", "descendants tree", "cjkv")
            for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE)
        )
        if is_descendant_section and wxr.config.capture_descendants:
            extract_descendant_section(
                wxr,
                level_node,
                page_data or [base_data],
            )
        elif not is_descendant_section:
            extract_linkage_section(
                wxr,
                page_data or [base_data],
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif wxr.config.capture_translations and subtitle in TRANSLATIONS_TITLES:
        if not page_data:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation(wxr, page_data, level_node)
    elif wxr.config.capture_inflections and subtitle in INFLECTION_TITLES:
        extract_inflections(wxr, page_data or [base_data], level_node)
    elif wxr.config.capture_descendants and subtitle in DESCENDANTS_TITLES:
        extract_descendant_section(
            wxr, level_node, page_data or [base_data]
        )
    elif subtitle in USAGE_NOTE_TITLES:
        extract_note_section(
            wxr, page_data[-1] if page_data else base_data, level_node
        )
    else:
        wxr.wtp.debug(
            f"Unhandled subtitle: {subtitle}",
            sortid="extractor/zh/page/parse_section/192",
        )

    # Nested sections see the (possibly copied) base_data bound above.
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)

    for template in level_node.find_child(NodeKind.TEMPLATE):
        add_page_end_categories(wxr, page_data, template)

124 

125 

def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Append a new entry for a part-of-speech section and fill it in.

    ``base_data.pos`` is set from ``POS_TITLES[pos_title]`` and a deep copy
    of ``base_data`` is appended to ``page_data``.  Every ``#`` list that is
    a direct child is passed to ``extract_gloss``; the children before the
    first such list are passed to ``extract_pos_head_line_nodes``.
    """
    pos_info = POS_TITLES[pos_title]
    pos_name = pos_info["pos"]
    base_data.pos = pos_name

    new_entry = base_data.model_copy(deep=True)
    page_data.append(new_entry)
    new_entry.pos_title = pos_title
    new_entry.pos_level = level_node.kind
    new_entry.tags.extend(pos_info.get("tags", []))

    # Index of the first gloss list; stays at len(children) when there is
    # none, so that every child counts as head-line content.
    head_line_end = len(level_node.children)
    for position, node in enumerate(level_node.children):
        is_gloss_list = (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.startswith("#")
        )
        if not is_gloss_list:
            continue
        head_line_end = min(head_line_end, position)
        extract_gloss(wxr, page_data, node, Sense())

    head_line_nodes = level_node.children[:head_line_end]
    extract_pos_head_line_nodes(wxr, page_data[-1], head_line_nodes)

    if page_data[-1].senses or level_node.contain_node(NodeKind.LIST):
        return

    # low quality pages don't put gloss in list
    section_wikitext = wxr.wtp.node_to_wikitext(
        list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    )
    expanded_node = wxr.wtp.parse(section_wikitext, expand_all=True)
    if expanded_node.contain_node(NodeKind.LIST):
        return

    gloss_text = clean_node(wxr, page_data[-1], expanded_node)
    if gloss_text:
        fallback_sense = Sense(glosses=[gloss_text])
    else:
        fallback_sense = Sense(tags=["no-gloss"])
    page_data[-1].senses.append(fallback_sense)

175 

176 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Page layout documents:
    https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
    https://zh.wiktionary.org/wiki/Wiktionary:体例说明
    https://zh.wiktionary.org/wiki/Wiktionary:格式手冊

    Returns an empty list for pages whose title ends with a translation
    subpage suffix or "/衍生詞".  Otherwise each level-2 section is handled
    as one language, and every collected entry is dumped with
    ``model_dump(exclude_defaults=True)``.
    """
    # skip translation pages
    skipped_suffixes = tuple(
        "/" + tr_title for tr_title in TRANSLATIONS_TITLES
    ) + ("/衍生詞",)
    if page_title.endswith(skipped_suffixes):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    root = wxr.wtp.parse(page_text, pre_expand=True)

    entries = []
    for lang_node in root.find_child(NodeKind.LEVEL2):
        heading_categories = {}
        lang_name = clean_node(wxr, heading_categories, lang_node.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":
            wxr.wtp.warning(
                f"Unrecognized language name: {lang_name}",
                sortid="extractor/zh/page/parse_page/509",
            )
            lang_code = "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = heading_categories.get("categories", [])

        for t_node in lang_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name == "zh-forms":
                process_zh_forms(wxr, base_data, t_node)

        for section_node in lang_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, entries, base_data, section_node)

        if lang_node.contain_node(NodeKind.LEVEL3):
            continue

        # No level-3 section at all: try the low-quality-page fallback and
        # discard the entry again if it added nothing to the copy.
        entries.append(base_data.model_copy(deep=True))
        process_low_quality_page(wxr, lang_node, entries[-1])
        if entries[-1] == base_data:
            entries.pop()

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]

241 

242 

def process_low_quality_page(
    wxr: WiktextractContext, level_node: WikiNode, word_entry: WordEntry
) -> None:
    """Fill ``word_entry`` from a language section with no subsections.

    Soft-redirect templates (``ja-see``, ``ja-see-kango``, ``zh-see``) are
    handed to ``process_soft_redirect_template``.  If none is present, ``#``
    lists are read as glosses; failing that, the cleaned text of the whole
    section becomes a single gloss and the part of speech is guessed from
    the entry's categories.

    Args:
        wxr: Extraction context.
        level_node: The language-level node to inspect.
        word_entry: Entry to update in place.
    """
    is_soft_redirect = False
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        # Compare in lower case, like process_soft_redirect_template and the
        # other template checks in this module, so that capitalised
        # spellings such as "Zh-see" are not skipped.
        if template_node.template_name.lower() in (
            "ja-see",
            "ja-see-kango",
            "zh-see",
        ):
            process_soft_redirect_template(wxr, template_node, word_entry)
            is_soft_redirect = True

    if is_soft_redirect:
        return

    # Not a redirect: the section only has a gloss list or bare gloss text.
    has_gloss_list = False
    for list_node in level_node.find_child(NodeKind.LIST):
        if list_node.sarg == "#":
            extract_gloss(wxr, [word_entry], list_node, Sense())
            has_gloss_list = True
    if has_gloss_list:
        return

    gloss_text = clean_node(wxr, word_entry, level_node.children)
    if not gloss_text:
        return

    # Guess the POS from the first category that, once the language name
    # prefix is removed, is a known POS title.
    for category in word_entry.categories:
        pos_title = category.removeprefix(word_entry.lang).strip()
        if pos_title in POS_TITLES:
            pos_data = POS_TITLES[pos_title]
            word_entry.pos = pos_data["pos"]
            word_entry.tags.extend(pos_data.get("tags", []))
            break
    word_entry.senses.append(Sense(glosses=[gloss_text]))

269 

270 

def process_soft_redirect_template(
    wxr: WiktextractContext, t_node: TemplateNode, word_entry: WordEntry
) -> None:
    """Record the targets of a soft-redirect template on ``word_entry``.

    Template documentation:
    https://zh.wiktionary.org/wiki/Template:Ja-see
    https://zh.wiktionary.org/wiki/Template:Ja-see-kango
    https://zh.wiktionary.org/wiki/Template:Zh-see

    ``zh-see`` contributes its first positional argument; the ``ja-see``
    templates contribute every integer-keyed argument.  An entry whose POS
    is still "unknown" is marked as "soft-redirect".
    """
    name = t_node.template_name.lower()
    params = t_node.template_parameters
    if name in ("ja-see", "ja-see-kango"):
        word_entry.redirects.extend(
            clean_node(wxr, None, arg_value)
            for arg_key, arg_value in params.items()
            if isinstance(arg_key, int)
        )
    elif name == "zh-see":
        target = clean_node(wxr, None, params.get(1, ""))
        word_entry.redirects.append(target)

    if word_entry.pos == "unknown":
        word_entry.pos = "soft-redirect"

289 

290 

def process_zh_forms(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Copy forms and the literal meaning from a ``zh-forms`` template.

    https://zh.wiktionary.org/wiki/Template:zh-forms

    Only string-keyed arguments are read: ``s``/``sN`` become forms tagged
    "Simplified Chinese", ``tN`` forms tagged "Traditional Chinese", ``alt``
    is a comma-separated list of ``form-tag-tag…`` items, and ``lit`` sets
    ``base_data.literal_meaning``.
    """
    for arg_name, arg_value in template_node.template_parameters.items():
        if not isinstance(arg_name, str):
            continue

        script_tag = None
        if re.fullmatch(r"s\d*", arg_name):
            script_tag = "Simplified Chinese"
        elif re.fullmatch(r"t\d+", arg_name):
            script_tag = "Traditional Chinese"

        if script_tag is not None:
            variant = Form(
                form=clean_node(wxr, None, arg_value), tags=[script_tag]
            )
            if variant.form:
                base_data.forms.append(variant)
            continue

        if arg_name == "alt":
            for item in clean_node(wxr, None, arg_value).split(","):
                # First hyphen-separated piece is the form, the rest are
                # kept as raw tags.
                head, *qualifiers = item.split("-")
                variant = Form(form=head, raw_tags=qualifiers)
                if variant.form:
                    base_data.forms.append(variant)
        elif arg_name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, arg_value)

322 

323 

# Lower-case names of templates that are cleaned only to collect the
# categories they add.
# https://zh.wiktionary.org/wiki/Template:Zh-cat
# https://zh.wiktionary.org/wiki/Template:Catlangname
CATEGORY_TEMPLATES = frozenset(
    {
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
    }
)

327 

328 

def add_page_end_categories(
    wxr: WiktextractContext, page_data: list[WordEntry], template: TemplateNode
) -> None:
    """Add categories from a category template to the latest language.

    Templates whose lower-cased name is not in ``CATEGORY_TEMPLATES`` are
    ignored.  Otherwise the categories collected by ``clean_node`` are
    appended to every entry that shares the ``lang_code`` of the last entry
    in ``page_data``.
    """
    if template.template_name.lower() not in CATEGORY_TEMPLATES:
        return

    collected = {}
    clean_node(wxr, collected, template)
    new_categories = collected.get("categories", [])
    for entry in page_data:
        if entry.lang_code == page_data[-1].lang_code:
            entry.categories.extend(new_categories)