Coverage for src/wiktextract/extractor/zh/page.py: 87%
234 statements
« prev ^ index » next — coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 HTMLNode,
8 LevelNode,
9 NodeKind,
10 TemplateNode,
11 WikiNode,
12)
14from ...page import clean_node
15from ...wxr_context import WiktextractContext
16from ...wxr_logging import logger
17from .descendant import extract_descendant_section
18from .etymology import extract_etymology_section
19from .gloss import extract_gloss
20from .headword_line import extract_pos_head_line_nodes
21from .inflection import extract_inflections
22from .linkage import extract_linkage_section
23from .models import Form, Linkage, Sense, WordEntry
24from .note import extract_note_section
25from .pronunciation import extract_pronunciation_section
26from .section_titles import (
27 DESCENDANTS_TITLES,
28 ETYMOLOGY_TITLES,
29 IGNORED_TITLES,
30 INFLECTION_TITLES,
31 LINKAGE_TITLES,
32 POS_TITLES,
33 PRONUNCIATION_TITLES,
34 TRANSLATIONS_TITLES,
35 USAGE_NOTE_TITLES,
36)
37from .tags import translate_raw_tags
38from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section of a language entry to the matching extractor.

    The cleaned section title decides which ``extract_*`` function handles
    this node (POS, etymology, pronunciation, linkage, translation, …),
    then child sections are parsed recursively.

    Args:
        wxr: extraction context (config, wikitext processor).
        page_data: word entries accumulated so far for this page.
        base_data: template entry holding language-level data; may be
            deep-copied before being passed to sub-extractors so that
            section-local data doesn't leak into sibling sections.
        level_node: the heading node of the section being processed.
    """
    subtitle = clean_node(wxr, None, level_node.largs)
    # Remove a trailing number suffix from the subtitle, e.g. "詞源1" ->
    # "詞源".  NOTE(review): the previous pattern r"\s*(?:(.+)|\d+)$" had a
    # catch-all "(.+)" alternative that matched the whole subtitle and
    # always reduced it to "" — only trailing digits should be stripped.
    subtitle = re.sub(r"\s*\d+$", "", subtitle)
    wxr.wtp.start_subsection(subtitle)
    if subtitle in IGNORED_TITLES:
        pass
    elif subtitle in POS_TITLES:
        process_pos_block(wxr, page_data, base_data, level_node, subtitle)
        # Some titles are both POS and linkage titles; if the POS block
        # produced no senses, re-interpret the section as linkage data.
        if len(page_data[-1].senses) == 0 and subtitle in LINKAGE_TITLES:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif wxr.config.capture_etymologies and subtitle.startswith(
        tuple(ETYMOLOGY_TITLES)
    ):
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            # Copy so etymology data stays scoped to this section's children.
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif wxr.config.capture_pronunciation and subtitle in PRONUNCIATION_TITLES:
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_pronunciation_section(wxr, base_data, level_node)
    elif wxr.config.capture_linkages and subtitle in LINKAGE_TITLES:
        # Some linkage titles double as descendant sections; detect the
        # descendant templates to decide which extractor applies.
        is_descendant_section = False
        if subtitle in DESCENDANTS_TITLES:
            for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
                if t_node.template_name.lower() in [
                    "desc",
                    "descendant",
                    "desctree",
                    "descendants tree",
                    "cjkv",
                ]:
                    is_descendant_section = True
                    break
        if is_descendant_section and wxr.config.capture_descendants:
            extract_descendant_section(
                wxr,
                level_node,
                page_data if len(page_data) > 0 else [base_data],
            )
        elif not is_descendant_section:
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif wxr.config.capture_translations and subtitle in TRANSLATIONS_TITLES:
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation_section(wxr, page_data[-1], level_node)
    elif wxr.config.capture_inflections and subtitle in INFLECTION_TITLES:
        extract_inflections(
            wxr, page_data if len(page_data) > 0 else [base_data], level_node
        )
    elif wxr.config.capture_descendants and subtitle in DESCENDANTS_TITLES:
        extract_descendant_section(
            wxr, level_node, page_data if len(page_data) > 0 else [base_data]
        )
    elif subtitle in USAGE_NOTE_TITLES:
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    else:
        wxr.wtp.debug(
            f"Unhandled subtitle: {subtitle}",
            sortid="extractor/zh/page/parse_section/192",
        )

    # Recurse into nested subsections with the (possibly copied) base_data.
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)

    # Section-final category templates apply to entries of this language.
    for template in level_node.find_child(NodeKind.TEMPLATE):
        add_page_end_categories(
            wxr, page_data if len(page_data) else [base_data], template
        )
def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Create a new word entry for a POS section and extract its senses.

    Appends a deep copy of ``base_data`` to ``page_data`` (also setting
    ``base_data.pos`` itself) and fills it from the gloss lists found in
    the section; falls back to plain text for low-quality pages.
    """
    pos_info = POS_TITLES[pos_title]
    base_data.pos = pos_info["pos"]
    entry = base_data.model_copy(deep=True)
    entry.pos_title = pos_title
    entry.pos_level = level_node.kind
    entry.tags.extend(pos_info.get("tags", []))
    page_data.append(entry)

    # Locate the first gloss list; everything before it belongs to the
    # headword line.
    gloss_start = len(level_node.children)
    for idx, node in enumerate(level_node.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.startswith("#")
        ):
            gloss_start = min(gloss_start, idx)
            extract_gloss(wxr, page_data, node, Sense())

    extract_pos_head_line_nodes(
        wxr, page_data[-1], level_node.children[:gloss_start]
    )

    if len(page_data[-1].senses) == 0 and not level_node.contain_node(
        NodeKind.LIST
    ):
        # low quality pages don't put gloss in list
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
            ),
            expand_all=True,
        )
        if not expanded.contain_node(NodeKind.LIST):
            gloss_text = clean_node(wxr, page_data[-1], expanded)
            if len(gloss_text) > 0:
                page_data[-1].senses.append(Sense(glosses=[gloss_text]))
            else:
                page_data[-1].senses.append(Sense(tags=["no-gloss"]))
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one zh-wiktionary page into a list of word-entry dicts.

    Each level-2 heading is a language section; level-3 children are
    dispatched through ``parse_section``.  Returns the entries serialized
    with defaults excluded.
    """
    # page layout documents
    # https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
    # https://zh.wiktionary.org/wiki/Wiktionary:体例说明
    # https://zh.wiktionary.org/wiki/Wiktionary:格式手冊

    # skip translation pages
    skip_suffixes = tuple("/" + t for t in TRANSLATIONS_TITLES) + ("/衍生詞",)
    if page_title.endswith(skip_suffixes):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    page_data: list[WordEntry] = []
    for lang_section in tree.find_child(NodeKind.LEVEL2):
        cats: dict[str, Any] = {}
        lang_name = clean_node(wxr, cats, lang_section.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":
            wxr.wtp.warning(
                f"Unrecognized language name: {lang_name}",
                sortid="extractor/zh/page/parse_page/509",
            )
            lang_code = "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = cats.get("categories", [])
        for t_node in lang_section.find_child(NodeKind.TEMPLATE):
            if t_node.template_name == "zh-forms":
                process_zh_forms(wxr, base_data, t_node)
        for sub_section in lang_section.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, sub_section)
        if not lang_section.contain_node(NodeKind.LEVEL3):
            # No sections at all: try to salvage something from the page.
            page_data.append(base_data.model_copy(deep=True))
            process_low_quality_page(wxr, lang_section, page_data[-1])
            if page_data[-1] == base_data:
                page_data.pop()  # nothing was extracted

    for entry in page_data:
        if len(entry.senses) == 0:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def process_low_quality_page(
    wxr: WiktextractContext, level_node: WikiNode, word_entry: WordEntry
) -> None:
    """Extract data from a language section that has no level-3 headings.

    Handles soft-redirect templates first; otherwise treats the section
    as a bare gloss (list items or plain text), guessing the POS from the
    entry's categories.
    """
    is_soft_redirect = False
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ("ja-see", "ja-see-kango", "zh-see"):
            process_soft_redirect_template(wxr, t_node, word_entry)
            is_soft_redirect = True
    if is_soft_redirect:
        return

    # only have a gloss text
    found_gloss_list = False
    for list_node in level_node.find_child(NodeKind.LIST):
        if list_node.sarg == "#":
            extract_gloss(wxr, [word_entry], list_node, Sense())
            found_gloss_list = True
    if found_gloss_list:
        return

    gloss_text = clean_node(wxr, word_entry, level_node.children)
    if len(gloss_text) == 0:
        return
    # Infer the POS from category names like "<lang> 名词".
    for category in word_entry.categories:
        category = category.removeprefix(word_entry.lang).strip()
        if category in POS_TITLES:
            pos_info = POS_TITLES[category]
            word_entry.pos = pos_info["pos"]
            word_entry.tags.extend(pos_info.get("tags", []))
            break
    word_entry.senses.append(Sense(glosses=[gloss_text]))
def process_soft_redirect_template(
    wxr: WiktextractContext, t_node: TemplateNode, word_entry: WordEntry
) -> None:
    """Record redirect targets from a soft-redirect template."""
    # https://zh.wiktionary.org/wiki/Template:Ja-see
    # https://zh.wiktionary.org/wiki/Template:Ja-see-kango
    # https://zh.wiktionary.org/wiki/Template:Zh-see
    name = t_node.template_name.lower()
    if name == "zh-see":
        target = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
        word_entry.redirects.append(target)
    elif name in ("ja-see", "ja-see-kango"):
        # Every positional parameter is a redirect target.
        word_entry.redirects.extend(
            clean_node(wxr, None, value)
            for key, value in t_node.template_parameters.items()
            if isinstance(key, int)
        )
    if word_entry.pos == "unknown":
        word_entry.pos = "soft-redirect"
def process_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract word forms from an expanded "zh-forms" template table."""
    # https://zh.wiktionary.org/wiki/Template:zh-forms
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in expanded.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            header_text = ""
            header_tags: list[str] = []
            # When the header cell itself carries form spans, the data
            # cells of that row are skipped.
            skip_data_cells = False
            for cell in row.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text, header_tags, skip_data_cells = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not skip_data_cells:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, header_text, header_tags
                    )
def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Process a "zh-forms" table header cell.

    Returns the header text (before the first HTML span), the raw tags
    split from it, and whether the cell contained any span (in which case
    the row's data cells are skipped by the caller).
    """
    has_span = False
    first_span_index = len(header_cell.children)
    for idx, _span in header_cell.find_html("span", with_index=True):
        first_span_index = min(first_span_index, idx)
        has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    row_tags = [
        piece.strip()
        for piece in re.split(r"/|與", row_header)
        if piece.strip() != ""
    ]
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for child in span_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "sup":
                # A <sup><span title="..."> annotation becomes a raw tag.
                for sup_span in child.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(child)
        if span_lang in ("zh-Hant", "zh-Hans"):
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word in (base_data.word, ""):
                    continue
                form = Form(form=word, raw_tags=row_tags)
                if sup_title != "":
                    form.raw_tags.append(sup_title)
                translate_raw_tags(form)
                base_data.forms.append(form)
    return row_header, row_tags, has_span
def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Process a "zh-forms" table data cell.

    Collects forms from the nested language spans; the "異序詞" (anagram)
    row goes into ``base_data.anagrams``, every other row into
    ``base_data.forms``.
    """
    for outer_span in cell.find_html("span"):
        forms: list[Form] = []
        for inner_span in outer_span.find_html("span"):
            lang_attr = inner_span.attrs.get("lang", "")
            if lang_attr in ("zh-Hant", "zh-Hans", "zh"):
                word = clean_node(wxr, None, inner_span)
                if word not in ("", "/", base_data.word):
                    form = Form(form=word)
                    if row_header != "異序詞":
                        form.raw_tags = row_header_tags
                    if lang_attr == "zh-Hant":
                        form.tags.append("Traditional-Chinese")
                    elif lang_attr == "zh-Hans":
                        form.tags.append("Simplified-Chinese")
                    translate_raw_tags(form)
                    forms.append(form)
            elif "font-size:80%" in inner_span.attrs.get("style", ""):
                # Small-font spans annotate the forms collected so far.
                raw_tag = clean_node(wxr, None, inner_span)
                if raw_tag != "":
                    for form in forms:
                        form.raw_tags.append(raw_tag)
                        translate_raw_tags(form)
        if row_header == "異序詞":
            for form in forms:
                base_data.anagrams.append(
                    Linkage(
                        word=form.form,
                        raw_tags=form.raw_tags,
                        tags=form.tags,
                    )
                )
        else:
            base_data.forms.extend(forms)
# https://zh.wiktionary.org/wiki/Template:Zh-cat
# https://zh.wiktionary.org/wiki/Template:Catlangname
# Lowercased names of templates that only add categories to the page.
CATEGORY_TEMPLATES = frozenset(
    {
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
        "top",
        "catlangcode",
        "topic",
    }
)
def add_page_end_categories(
    wxr: WiktextractContext, page_data: list[WordEntry], template: TemplateNode
) -> None:
    """Apply a page-end category template to entries of the last language.

    Categories gathered from the template are appended to every entry
    whose lang_code matches the most recent entry's.
    """
    if template.template_name.lower() not in CATEGORY_TEMPLATES:
        return
    if len(page_data) == 0:
        return
    cats: dict[str, Any] = {}
    clean_node(wxr, cats, template)
    new_categories = cats.get("categories", [])
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.categories.extend(new_categories)