Coverage for src / wiktextract / extractor / zh / page.py: 88%
238 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 HTMLNode,
8 LevelNode,
9 NodeKind,
10 TemplateNode,
11 WikiNode,
12)
14from ...page import clean_node
15from ...wxr_context import WiktextractContext
16from ...wxr_logging import logger
17from .descendant import extract_descendant_section
18from .etymology import extract_etymology_section, extract_ja_kanjitab_template
19from .gloss import extract_gloss
20from .headword_line import extract_pos_head_line_nodes
21from .inflection import extract_inflections
22from .linkage import extract_linkage_section
23from .models import Form, Linkage, Sense, WordEntry
24from .note import extract_note_section
25from .pronunciation import extract_pronunciation_section
26from .section_titles import (
27 DESCENDANTS_TITLES,
28 ETYMOLOGY_TITLES,
29 IGNORED_TITLES,
30 INFLECTION_TITLES,
31 LINKAGE_TITLES,
32 POS_TITLES,
33 PRONUNCIATION_TITLES,
34 TRANSLATIONS_TITLES,
35 USAGE_NOTE_TITLES,
36)
37from .tags import translate_raw_tags
38from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section node to the extractor matching its title.

    After handling the section itself, recurses into child level nodes and
    finally applies any category templates found directly under this level.
    """
    subtitle = clean_node(wxr, None, level_node.largs)
    # remove number suffix from subtitle, e.g. "詞源 2" or "詞源一".
    # BUG FIX: the previous pattern r"\s*(?:(.+)|\d+)$" removed the whole
    # subtitle because "(.+)" matches any non-empty remainder; strip only
    # trailing Arabic digits or Chinese numerals.
    subtitle = re.sub(r"\s*(?:[一二三四五六七八九十]+|\d+)$", "", subtitle)
    wxr.wtp.start_subsection(subtitle)
    if subtitle in IGNORED_TITLES:
        pass
    elif subtitle in POS_TITLES:
        process_pos_block(wxr, page_data, base_data, level_node, subtitle)
        # Some titles appear in both POS_TITLES and LINKAGE_TITLES; if the
        # POS pass produced no senses, redo the section as a linkage section.
        if len(page_data[-1].senses) == 0 and subtitle in LINKAGE_TITLES:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif wxr.config.capture_etymologies and subtitle.startswith(
        tuple(ETYMOLOGY_TITLES)
    ):
        # Copy base_data when this section has nested levels — presumably so
        # the etymology attaches only to the entries nested under it.
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif wxr.config.capture_pronunciation and subtitle in PRONUNCIATION_TITLES:
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_pronunciation_section(wxr, base_data, level_node)
    elif wxr.config.capture_linkages and subtitle in LINKAGE_TITLES:
        # A title listed in DESCENDANTS_TITLES may hold either linkage lists
        # or descendant templates; sniff the templates used to decide.
        is_descendant_section = False
        if subtitle in DESCENDANTS_TITLES:
            for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
                if t_node.template_name.lower() in [
                    "desc",
                    "descendant",
                    "desctree",
                    "descendants tree",
                    "cjkv",
                ]:
                    is_descendant_section = True
                    break
        if is_descendant_section and wxr.config.capture_descendants:
            extract_descendant_section(
                wxr,
                level_node,
                page_data if len(page_data) > 0 else [base_data],
            )
        elif not is_descendant_section:
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif wxr.config.capture_translations and subtitle in TRANSLATIONS_TITLES:
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation_section(wxr, page_data[-1], level_node)
    elif wxr.config.capture_inflections and subtitle in INFLECTION_TITLES:
        extract_inflections(
            wxr, page_data if len(page_data) > 0 else [base_data], level_node
        )
    elif wxr.config.capture_descendants and subtitle in DESCENDANTS_TITLES:
        extract_descendant_section(
            wxr, level_node, page_data if len(page_data) > 0 else [base_data]
        )
    elif subtitle in USAGE_NOTE_TITLES:
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    else:
        wxr.wtp.debug(
            f"Unhandled subtitle: {subtitle}",
            sortid="extractor/zh/page/parse_section/192",
        )

    # recurse into nested sections
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)

    # category templates placed at the end of a section
    for template in level_node.find_child(NodeKind.TEMPLATE):
        add_page_end_categories(
            wxr, page_data if len(page_data) else [base_data], template
        )
def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Start a new word entry for a part-of-speech section and extract its
    headword line and glosses."""
    pos_info = POS_TITLES[pos_title]
    base_data.pos = pos_info["pos"]
    page_data.append(base_data.model_copy(deep=True))
    entry = page_data[-1]
    entry.pos_title = pos_title
    entry.pos_level = level_node.kind
    entry.tags.extend(pos_info.get("tags", []))

    # Every "#" list is a gloss list; everything before the first one is
    # treated as the headword line.
    gloss_start = len(level_node.children)
    for idx, node in enumerate(level_node.children):
        is_gloss_list = (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.startswith("#")
        )
        if not is_gloss_list:
            continue
        gloss_start = min(gloss_start, idx)
        extract_gloss(wxr, page_data, node, Sense())

    extract_pos_head_line_nodes(
        wxr, page_data[-1], level_node.children[:gloss_start]
    )

    if len(page_data[-1].senses) == 0 and not level_node.contain_node(
        NodeKind.LIST
    ):
        # low quality pages don't put gloss in list
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                list(
                    level_node.invert_find_child(
                        LEVEL_KIND_FLAGS, include_empty_str=True
                    )
                )
            ),
            expand_all=True,
        )
        if not expanded.contain_node(NodeKind.LIST):
            gloss_text = clean_node(wxr, page_data[-1], expanded)
            if gloss_text != "":
                page_data[-1].senses.append(Sense(glosses=[gloss_text]))
            else:
                page_data[-1].senses.append(Sense(tags=["no-gloss"]))
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a Chinese Wiktionary page into a list of word-entry dicts.

    One WordEntry is produced per part-of-speech section; entries with no
    senses get a "no-gloss" placeholder sense.
    """
    # page layout documents
    # https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
    # https://zh.wiktionary.org/wiki/Wiktionary:体例说明
    # https://zh.wiktionary.org/wiki/Wiktionary:格式手冊

    # skip translation pages
    if page_title.endswith(
        tuple("/" + tr_title for tr_title in TRANSLATIONS_TITLES) + ("/衍生詞",)
    ):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    page_data = []
    # each level-2 section is one language
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_name = clean_node(wxr, categories, level2_node.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":
            wxr.wtp.warning(
                f"Unrecognized language name: {lang_name}",
                sortid="extractor/zh/page/parse_page/509",
            )
            lang_code = "unknown"
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        # base_data holds language-level data that parse_section deep-copies
        # into each part-of-speech entry
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = categories.get("categories", [])
        # language-level templates placed before any subsection
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name == "zh-forms":
                process_zh_forms(wxr, base_data, t_node)
            elif (
                t_node.template_name.endswith("-kanjitab")
                or t_node.template_name == "ja-kt"
            ):
                extract_ja_kanjitab_template(wxr, t_node, base_data)

        for level3_node in level2_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, level3_node)
        if not level2_node.contain_node(NodeKind.LEVEL3):
            # no subsections at all: soft redirect or bare gloss text
            page_data.append(base_data.model_copy(deep=True))
            process_low_quality_page(wxr, level2_node, page_data[-1])
            if page_data[-1] == base_data:
                # nothing was extracted; drop the placeholder entry
                page_data.pop()

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]
def process_low_quality_page(
    wxr: WiktextractContext, level_node: WikiNode, word_entry: WordEntry
) -> None:
    """Handle a language section that has no subsections.

    Such sections are either soft redirects ("zh-see"/"ja-see" templates)
    or contain just a bare gloss, possibly not even in a "#" list.
    """
    is_soft_redirect = False
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name in ("ja-see", "ja-see-kango", "zh-see"):
            process_soft_redirect_template(wxr, template_node, word_entry)
            is_soft_redirect = True

    if not is_soft_redirect:  # only have a gloss text
        has_gloss_list = False
        for list_node in level_node.find_child(NodeKind.LIST):
            if list_node.sarg == "#":
                extract_gloss(wxr, [word_entry], list_node, Sense())
                has_gloss_list = True
        if not has_gloss_list:
            # no gloss list either: treat the whole section text as a gloss
            gloss_text = clean_node(wxr, word_entry, level_node.children)
            if len(gloss_text) > 0:
                # infer the part of speech from a category name that is the
                # language name followed by a POS title
                for cat in word_entry.categories:
                    cat = cat.removeprefix(word_entry.lang).strip()
                    if cat in POS_TITLES:
                        pos_data = POS_TITLES[cat]
                        word_entry.pos = pos_data["pos"]
                        word_entry.tags.extend(pos_data.get("tags", []))
                        break
                word_entry.senses.append(Sense(glosses=[gloss_text]))
def process_soft_redirect_template(
    wxr: WiktextractContext, t_node: TemplateNode, word_entry: WordEntry
) -> None:
    """Record the redirect target(s) of a soft-redirect template.

    https://zh.wiktionary.org/wiki/Template:Ja-see
    https://zh.wiktionary.org/wiki/Template:Ja-see-kango
    https://zh.wiktionary.org/wiki/Template:Zh-see
    """
    name = t_node.template_name.lower()
    if name == "zh-see":
        # single target in the first positional argument
        target = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
        word_entry.redirects.append(target)
    elif name in ("ja-see", "ja-see-kango"):
        # every positional argument is a redirect target
        for arg_key, arg_value in t_node.template_parameters.items():
            if isinstance(arg_key, int):
                word_entry.redirects.append(clean_node(wxr, None, arg_value))
    if word_entry.pos == "unknown":
        word_entry.pos = "soft-redirect"
def process_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract literal meaning and word forms from a "zh-forms" template.

    https://zh.wiktionary.org/wiki/Template:zh-forms
    """
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    # expand the template so the rendered HTML table can be walked
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_header_tags = []
            header_has_span = False
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header, row_header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not header_has_span:
                    # data cells are skipped when the header cell itself
                    # contained form spans
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, row_header, row_header_tags
                    )
def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Process a header cell of the "zh-forms" table.

    Returns ``(row_header, row_header_tags, header_has_span)`` where
    ``row_header`` is the label text before the first ``<span>``,
    ``row_header_tags`` are its "/" or "與" separated parts, and
    ``header_has_span`` tells whether the header itself held form spans.
    """
    row_header = ""
    row_header_tags = []
    header_has_span = False
    # the label is everything before the first <span> child
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
        header_has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    # split combined labels like "簡體/繁體" or "A與B" into raw tags
    for raw_tag in re.split(r"/|與", row_header):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    # <span> elements in the header hold word forms themselves
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                # a superscript span's "title" attribute is a qualifier tag
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span
def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Extract word forms from one data cell of the "zh-forms" table.

    ``row_header`` / ``row_header_tags`` come from the row's header cell.
    Forms in an anagram row ("異序詞") are stored in ``base_data.anagrams``,
    all others in ``base_data.forms``.
    """
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            # wrapper span; recurse into its children
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            # small text is a qualifier applying to the forms collected so far
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                if row_header != "異序詞":
                    # BUG FIX: assign a copy. The original assigned the shared
                    # list object itself, so appending a qualifier to one form
                    # (branch above) also mutated row_header_tags and every
                    # sibling form, duplicating tags.
                    form.raw_tags = list(row_header_tags)
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)
    if row_header == "異序詞":
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)
# https://zh.wiktionary.org/wiki/Template:Zh-cat
# https://zh.wiktionary.org/wiki/Template:Catlangname
# Lower-cased names of templates that only add categories to a page.
CATEGORY_TEMPLATES = frozenset(
    (
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
        "top",
        "catlangcode",
        "topic",
    )
)
def add_page_end_categories(
    wxr: WiktextractContext, page_data: list[WordEntry], template: TemplateNode
) -> None:
    """Apply a page-end category template to all entries of the current
    (i.e. last) language section."""
    if template.template_name.lower() not in CATEGORY_TEMPLATES:
        return
    collected = {}
    clean_node(wxr, collected, template)
    new_categories = collected.get("categories", [])
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.categories.extend(new_categories)