Coverage for src / wiktextract / extractor / zh / page.py: 88%

238 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 HTMLNode, 

8 LevelNode, 

9 NodeKind, 

10 TemplateNode, 

11 WikiNode, 

12) 

13 

14from ...page import clean_node 

15from ...wxr_context import WiktextractContext 

16from ...wxr_logging import logger 

17from .descendant import extract_descendant_section 

18from .etymology import extract_etymology_section, extract_ja_kanjitab_template 

19from .gloss import extract_gloss 

20from .headword_line import extract_pos_head_line_nodes 

21from .inflection import extract_inflections 

22from .linkage import extract_linkage_section 

23from .models import Form, Linkage, Sense, WordEntry 

24from .note import extract_note_section 

25from .pronunciation import extract_pronunciation_section 

26from .section_titles import ( 

27 DESCENDANTS_TITLES, 

28 ETYMOLOGY_TITLES, 

29 IGNORED_TITLES, 

30 INFLECTION_TITLES, 

31 LINKAGE_TITLES, 

32 POS_TITLES, 

33 PRONUNCIATION_TITLES, 

34 TRANSLATIONS_TITLES, 

35 USAGE_NOTE_TITLES, 

36) 

37from .tags import translate_raw_tags 

38from .translation import extract_translation_section 

39 

40 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section of an entry to the matching extractor.

    The cleaned section title is looked up in the various ``*_TITLES``
    tables to decide which extractor runs; child level nodes are then
    parsed recursively, and trailing category templates are applied last.
    """
    subtitle = clean_node(wxr, None, level_node.largs)
    # Remove a numeric suffix from the subtitle ("名詞 1" -> "名詞") so it
    # matches the lookup tables.  The previous pattern
    # r"\s*(?:(.+)|\d+)$" was broken: the unrestricted "(.+)" alternative
    # matched the entire title and blanked it, so no section title could
    # ever be recognized.  Strip only trailing digits.
    # NOTE(review): if section titles are also numbered with non-Arabic
    # numerals (e.g. 一, 二), extend the pattern accordingly — confirm
    # against real page data.
    subtitle = re.sub(r"\s*\d+$", "", subtitle)
    wxr.wtp.start_subsection(subtitle)
    if subtitle in IGNORED_TITLES:
        pass
    elif subtitle in POS_TITLES:
        process_pos_block(wxr, page_data, base_data, level_node, subtitle)
        # Some titles are both POS and linkage titles; if the POS block
        # produced no senses, re-process the section as linkage instead.
        if len(page_data[-1].senses) == 0 and subtitle in LINKAGE_TITLES:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif wxr.config.capture_etymologies and subtitle.startswith(
        tuple(ETYMOLOGY_TITLES)
    ):
        # Copy base_data when sub-levels exist so etymology text does not
        # leak into sibling sections.
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif wxr.config.capture_pronunciation and subtitle in PRONUNCIATION_TITLES:
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_pronunciation_section(wxr, base_data, level_node)
    elif wxr.config.capture_linkages and subtitle in LINKAGE_TITLES:
        # A descendants-style template inside a linkage-titled section
        # means the section actually holds descendant data.
        is_descendant_section = False
        if subtitle in DESCENDANTS_TITLES:
            for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
                if t_node.template_name.lower() in [
                    "desc",
                    "descendant",
                    "desctree",
                    "descendants tree",
                    "cjkv",
                ]:
                    is_descendant_section = True
                    break
        if is_descendant_section and wxr.config.capture_descendants:
            extract_descendant_section(
                wxr,
                level_node,
                page_data if len(page_data) > 0 else [base_data],
            )
        elif not is_descendant_section:
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif wxr.config.capture_translations and subtitle in TRANSLATIONS_TITLES:
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation_section(wxr, page_data[-1], level_node)
    elif wxr.config.capture_inflections and subtitle in INFLECTION_TITLES:
        extract_inflections(
            wxr, page_data if len(page_data) > 0 else [base_data], level_node
        )
    elif wxr.config.capture_descendants and subtitle in DESCENDANTS_TITLES:
        extract_descendant_section(
            wxr, level_node, page_data if len(page_data) > 0 else [base_data]
        )
    elif subtitle in USAGE_NOTE_TITLES:
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    else:
        wxr.wtp.debug(
            f"Unhandled subtitle: {subtitle}",
            sortid="extractor/zh/page/parse_section/192",
        )

    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)

    for template in level_node.find_child(NodeKind.TEMPLATE):
        add_page_end_categories(
            wxr, page_data if len(page_data) else [base_data], template
        )

128 

129 

def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Create a new word entry for a part-of-speech section and fill it.

    Gloss lists ("#" lists) become senses; everything before the first
    gloss list is treated as the headword line.  Sections without any list
    fall back to using the expanded plain text as a single gloss.
    """
    pos_info = POS_TITLES[pos_title]
    # Written onto base_data as well so later sections inherit the POS.
    base_data.pos = pos_info["pos"]
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    entry.pos_level = level_node.kind
    entry.tags.extend(pos_info.get("tags", []))

    gloss_start = len(level_node.children)
    for idx, node in enumerate(level_node.children):
        is_gloss_list = (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.startswith("#")
        )
        if not is_gloss_list:
            continue
        gloss_start = min(gloss_start, idx)
        extract_gloss(wxr, page_data, node, Sense())

    extract_pos_head_line_nodes(
        wxr, page_data[-1], level_node.children[:gloss_start]
    )

    if len(page_data[-1].senses) == 0 and not level_node.contain_node(
        NodeKind.LIST
    ):
        # Low quality pages don't put the gloss in a list; expand the
        # remaining section content and use its plain text as the gloss.
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                list(
                    level_node.invert_find_child(
                        LEVEL_KIND_FLAGS, include_empty_str=True
                    )
                )
            ),
            expand_all=True,
        )
        if not expanded_node.contain_node(NodeKind.LIST):
            gloss_text = clean_node(wxr, page_data[-1], expanded_node)
            if len(gloss_text) > 0:
                page_data[-1].senses.append(Sense(glosses=[gloss_text]))
            else:
                page_data[-1].senses.append(Sense(tags=["no-gloss"]))

183 

184 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Chinese Wiktionary page into a list of word-entry dicts.

    Each level-2 section is one language; a per-language ``base_data``
    carries shared fields (forms, categories) into the POS sub-sections.
    """
    # page layout documents
    # https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
    # https://zh.wiktionary.org/wiki/Wiktionary:体例说明
    # https://zh.wiktionary.org/wiki/Wiktionary:格式手冊

    # skip translation pages
    if page_title.endswith(
        tuple("/" + tr_title for tr_title in TRANSLATIONS_TITLES) + ("/衍生詞",)
    ):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    page_data = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        # clean_node collects categories emitted by the heading into the
        # dict passed as its second argument.
        lang_name = clean_node(wxr, categories, level2_node.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":
            wxr.wtp.warning(
                f"Unrecognized language name: {lang_name}",
                sortid="extractor/zh/page/parse_page/509",
            )
            lang_code = "unknown"
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = categories.get("categories", [])
        # Language-level templates that describe the headword itself.
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name == "zh-forms":
                process_zh_forms(wxr, base_data, t_node)
            elif (
                t_node.template_name.endswith("-kanjitab")
                or t_node.template_name == "ja-kt"
            ):
                extract_ja_kanjitab_template(wxr, t_node, base_data)

        for level3_node in level2_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, level3_node)
        if not level2_node.contain_node(NodeKind.LEVEL3):
            # No sub-sections at all: treat the whole language section as
            # a low quality page (soft redirect or bare gloss text).
            page_data.append(base_data.model_copy(deep=True))
            process_low_quality_page(wxr, level2_node, page_data[-1])
            if page_data[-1] == base_data:
                # Nothing was extracted; drop the untouched copy.
                page_data.pop()

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))

    return [d.model_dump(exclude_defaults=True) for d in page_data]

254 

255 

def process_low_quality_page(
    wxr: WiktextractContext, level_node: WikiNode, word_entry: WordEntry
) -> None:
    """Handle a language section that has no sub-sections.

    Soft-redirect templates ("zh-see", "ja-see", "ja-see-kango") take
    precedence; otherwise gloss lists, then bare section text, are used.
    """
    found_redirect = False
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ("ja-see", "ja-see-kango", "zh-see"):
            process_soft_redirect_template(wxr, t_node, word_entry)
            found_redirect = True
    if found_redirect:
        return

    # Not a redirect: the section only has a gloss.
    found_gloss_list = False
    for list_node in level_node.find_child(NodeKind.LIST):
        if list_node.sarg == "#":
            extract_gloss(wxr, [word_entry], list_node, Sense())
            found_gloss_list = True
    if found_gloss_list:
        return

    gloss_text = clean_node(wxr, word_entry, level_node.children)
    if len(gloss_text) > 0:
        # Guess the part of speech from the page categories by stripping
        # the language-name prefix and matching the remainder.
        for cat in word_entry.categories:
            cat = cat.removeprefix(word_entry.lang).strip()
            if cat in POS_TITLES:
                pos_data = POS_TITLES[cat]
                word_entry.pos = pos_data["pos"]
                word_entry.tags.extend(pos_data.get("tags", []))
                break
        word_entry.senses.append(Sense(glosses=[gloss_text]))

282 

283 

def process_soft_redirect_template(
    wxr: WiktextractContext, t_node: TemplateNode, word_entry: WordEntry
) -> None:
    """Record the redirect target(s) of a soft-redirect template.

    https://zh.wiktionary.org/wiki/Template:Ja-see
    https://zh.wiktionary.org/wiki/Template:Ja-see-kango
    https://zh.wiktionary.org/wiki/Template:Zh-see
    """
    name = t_node.template_name.lower()
    if name == "zh-see":
        # Only the first positional argument is a redirect target.
        target = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
        word_entry.redirects.append(target)
    elif name in ("ja-see", "ja-see-kango"):
        # Every positional argument is a redirect target.
        for key, value in t_node.template_parameters.items():
            if isinstance(key, int):
                word_entry.redirects.append(clean_node(wxr, None, value))

    if word_entry.pos == "unknown":
        word_entry.pos = "soft-redirect"

302 

303 

def process_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract word forms from an expanded "zh-forms" table.

    https://zh.wiktionary.org/wiki/Template:zh-forms
    """
    lit = t_node.template_parameters.get("lit", "")
    base_data.literal_meaning = clean_node(wxr, None, lit)
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in expanded.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            header_text = ""
            header_tags: list[str] = []
            # When the header cell itself contained form spans, the data
            # cells of the row are skipped.
            skip_data_cells = False
            for cell in row.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text, header_tags, skip_data_cells = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not skip_data_cells:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, header_text, header_tags
                    )

330 

331 

def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Parse one header cell of the "zh-forms" table.

    Returns ``(row_header, row_header_tags, header_has_span)``.  When the
    header cell itself contains ``<span>`` word forms, they are appended
    to ``base_data.forms`` here and the caller skips the row's data cells.
    """
    row_header = ""
    row_header_tags = []
    header_has_span = False
    # The header text is everything before the first <span> child.
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
        header_has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    # Split compound headers joined with "/" or "與" ("and") into tags.
    for raw_tag in re.split(r"/|與", row_header):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                # <sup><span title="..."> carries an extra qualifier for
                # the form, e.g. a regional usage note.
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            # Spans may hold several variants separated by "/".
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span

367 

368 

def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Parse one data cell of the "zh-forms" table.

    Word forms found in the cell are appended to ``base_data.forms``,
    except for the "異序詞" (anagram) row, whose entries go to
    ``base_data.anagrams``.  ``row_header_tags`` is treated as read-only.
    """
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            # Wrapper span; recurse into its children.
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            # A small-font span qualifies the forms seen so far in this
            # cell.
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                if row_header != "異序詞":
                    # Copy instead of aliasing: the original assigned the
                    # shared row_header_tags list directly, so the
                    # qualifier append above mutated the caller's list and
                    # every other form in the row at once.
                    form.raw_tags = list(row_header_tags)
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "異序詞":
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)

410 

411 

# https://zh.wiktionary.org/wiki/Template:Zh-cat
# https://zh.wiktionary.org/wiki/Template:Catlangname
# Lower-cased names of templates that only add categories to the page;
# matched against template names in add_page_end_categories().
CATEGORY_TEMPLATES = frozenset(
    [
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
        "top",
        "catlangcode",
        "topic",
    ]
)

426 

427 

def add_page_end_categories(
    wxr: WiktextractContext, page_data: list[WordEntry], template: TemplateNode
) -> None:
    """Apply a page-end category template to all entries that share the
    last entry's language."""
    if template.template_name.lower() not in CATEGORY_TEMPLATES:
        return
    collected = {}
    clean_node(wxr, collected, template)
    new_categories = collected.get("categories", [])
    last_lang_code = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == last_lang_code:
            entry.categories.extend(new_categories)