Coverage for src/wiktextract/extractor/zh/page.py: 80% (182 statements)
coverage.py v7.9.2, created at 2025-07-09 23:59 +0000
import re
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section
from .gloss import extract_gloss
from .headword_line import extract_pos_head_line_nodes
from .inflection import extract_inflections
from .linkage import extract_linkage_section
from .models import Form, Sense, WordEntry
from .note import extract_note_section
from .pronunciation import extract_pronunciation_section
from .section_titles import (
    DESCENDANTS_TITLES,
    ETYMOLOGY_TITLES,
    IGNORED_TITLES,
    INFLECTION_TITLES,
    LINKAGE_TITLES,
    POS_TITLES,
    PRONUNCIATION_TITLES,
    TRANSLATIONS_TITLES,
    USAGE_NOTE_TITLES,
)
from .translation import extract_translation


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
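    """Dispatch one section of a language entry to the matching extractor.

    The cleaned section title selects the extractor (part of speech,
    etymology, pronunciation, linkage, translation, inflection,
    descendant or usage-note sections); unhandled titles are logged.
    Child sections are parsed recursively and page-end category
    templates are collected afterwards.
    """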
    subtitle = clean_node(wxr, None, level_node.largs)
    # remove a trailing digit or fullwidth-parenthesized suffix from the title
    subtitle = re.sub(r"\s*(?:（.+）|\d+)$", "", subtitle)
    wxr.wtp.start_subsection(subtitle)
    if subtitle in IGNORED_TITLES:  # coverage: never true
        pass
    elif subtitle in POS_TITLES:
        process_pos_block(wxr, page_data, base_data, level_node, subtitle)
        if len(page_data[-1].senses) == 0 and subtitle in LINKAGE_TITLES:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif wxr.config.capture_etymologies and subtitle.startswith(
        tuple(ETYMOLOGY_TITLES)
    ):
        if level_node.contain_node(LEVEL_KIND_FLAGS):  # coverage: always true
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif wxr.config.capture_pronunciation and subtitle in PRONUNCIATION_TITLES:
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_pronunciation_section(wxr, base_data, level_node)
    elif wxr.config.capture_linkages and subtitle in LINKAGE_TITLES:
        is_descendant_section = False
        if subtitle in DESCENDANTS_TITLES:
            for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):  # coverage: loop never completed
                if t_node.template_name.lower() in [  # coverage: always true
                    "desc",
                    "descendant",
                    "desctree",
                    "descendants tree",
                    "cjkv",
                ]:
                    is_descendant_section = True
                    break
        if is_descendant_section and wxr.config.capture_descendants:
            extract_descendant_section(
                wxr,
                level_node,
                page_data if len(page_data) > 0 else [base_data],
            )
        elif not is_descendant_section:  # coverage: always true
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif wxr.config.capture_translations and subtitle in TRANSLATIONS_TITLES:  # coverage: never true
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation(wxr, page_data, level_node)
    elif wxr.config.capture_inflections and subtitle in INFLECTION_TITLES:  # coverage: never true
        extract_inflections(
            wxr, page_data if len(page_data) > 0 else [base_data], level_node
        )
    elif wxr.config.capture_descendants and subtitle in DESCENDANTS_TITLES:  # coverage: never true
        extract_descendant_section(
            wxr, level_node, page_data if len(page_data) > 0 else [base_data]
        )
    elif subtitle in USAGE_NOTE_TITLES:  # coverage: always true
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    else:
        wxr.wtp.debug(
            f"Unhandled subtitle: {subtitle}",
            sortid="extractor/zh/page/parse_section/192",
        )

    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)

    for template in level_node.find_child(NodeKind.TEMPLATE):
        add_page_end_categories(wxr, page_data, template)


def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
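    """Start a new word entry for a part-of-speech section.

    Copies ``base_data``, records the POS name, level and tags, extracts
    glosses from "#" lists and the headword line, and falls back to the
    plain section text when a low-quality page has no gloss list.
    """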
    pos_data = POS_TITLES[pos_title]
    pos_type = pos_data["pos"]
    base_data.pos = pos_type
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    page_data[-1].pos_level = level_node.kind
    page_data[-1].tags.extend(pos_data.get("tags", []))
    first_gloss_list_index = len(level_node.children)
    for index, child in enumerate(level_node.children):
        if (
            isinstance(child, WikiNode)
            and child.kind == NodeKind.LIST
            and child.sarg.startswith("#")
        ):
            if index < first_gloss_list_index:  # coverage: always true
                first_gloss_list_index = index
            extract_gloss(wxr, page_data, child, Sense())

    extract_pos_head_line_nodes(
        wxr, page_data[-1], level_node.children[:first_gloss_list_index]
    )

    if len(page_data[-1].senses) == 0 and not level_node.contain_node(
        NodeKind.LIST
    ):
        # low quality pages don't put gloss in list
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
            ),
            expand_all=True,
        )
        if not expanded_node.contain_node(NodeKind.LIST):
            gloss_text = clean_node(
                wxr,
                page_data[-1],
                expanded_node,
            )
            if len(gloss_text) > 0:  # coverage: always true
                page_data[-1].senses.append(Sense(glosses=[gloss_text]))
            else:
                page_data[-1].senses.append(Sense(tags=["no-gloss"]))


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
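    """Parse one Chinese Wiktionary page into a list of word entry dicts.

    Each level-2 node is a language section; its level-3 child sections
    are parsed into ``WordEntry`` objects, and language sections without
    level-3 children are handled as low-quality pages.
    """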
    # page layout documents
    # https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
    # https://zh.wiktionary.org/wiki/Wiktionary:体例说明
    # https://zh.wiktionary.org/wiki/Wiktionary:格式手冊

    # skip translation pages
    if page_title.endswith(  # coverage: never true
        tuple("/" + tr_title for tr_title in TRANSLATIONS_TITLES) + ("/衍生詞",)
    ):
        return []

    if wxr.config.verbose:  # coverage: never true
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    page_data = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_name = clean_node(wxr, categories, level2_node.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":  # coverage: never true
            wxr.wtp.warning(
                f"Unrecognized language name: {lang_name}",
                sortid="extractor/zh/page/parse_page/509",
            )
            lang_code = "unknown"
        if (  # coverage: never true
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = categories.get("categories", [])
        for template_node in level2_node.find_child(NodeKind.TEMPLATE):
            if template_node.template_name == "zh-forms":
                process_zh_forms(wxr, base_data, template_node)

        for level3_node in level2_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, level3_node)
        if not level2_node.contain_node(NodeKind.LEVEL3):
            page_data.append(base_data.model_copy(deep=True))
            process_low_quality_page(wxr, level2_node, page_data[-1])
            if page_data[-1] == base_data:  # coverage: never true
                page_data.pop()

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))

    return [d.model_dump(exclude_defaults=True) for d in page_data]


def process_low_quality_page(
    wxr: WiktextractContext, level_node: WikiNode, word_entry: WordEntry
) -> None:
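    """Handle a language section that has no POS subsections.

    Follows soft-redirect templates when present; otherwise treats the
    bare section text as a single gloss and guesses the part of speech
    from the page categories.
    """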
    is_soft_redirect = False
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name in ("ja-see", "ja-see-kango", "zh-see"):
            process_soft_redirect_template(wxr, template_node, word_entry)
            is_soft_redirect = True

    if not is_soft_redirect:  # only have a gloss text
        has_gloss_list = False
        for list_node in level_node.find_child(NodeKind.LIST):  # coverage: loop never ran
            if list_node.sarg == "#":
                extract_gloss(wxr, [word_entry], list_node, Sense())
                has_gloss_list = True
        if not has_gloss_list:  # coverage: always true
            gloss_text = clean_node(wxr, word_entry, level_node.children)
            if len(gloss_text) > 0:  # coverage: always true
                for cat in word_entry.categories:
                    cat = cat.removeprefix(word_entry.lang).strip()
                    if cat in POS_TITLES:  # coverage: always true
                        pos_data = POS_TITLES[cat]
                        word_entry.pos = pos_data["pos"]
                        word_entry.tags.extend(pos_data.get("tags", []))
                        break
                word_entry.senses.append(Sense(glosses=[gloss_text]))


def process_soft_redirect_template(
    wxr: WiktextractContext, t_node: TemplateNode, word_entry: WordEntry
) -> None:
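    """Record the redirect targets of a {{zh-see}}, {{ja-see}} or
    {{ja-see-kango}} soft-redirect template."""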
    # https://zh.wiktionary.org/wiki/Template:Ja-see
    # https://zh.wiktionary.org/wiki/Template:Ja-see-kango
    # https://zh.wiktionary.org/wiki/Template:Zh-see
    template_name = t_node.template_name.lower()
    if template_name == "zh-see":
        word_entry.redirects.append(
            clean_node(wxr, None, t_node.template_parameters.get(1, ""))
        )
    elif template_name in ("ja-see", "ja-see-kango"):  # coverage: always true
        for key, value in t_node.template_parameters.items():
            if isinstance(key, int):  # coverage: always true
                word_entry.redirects.append(clean_node(wxr, None, value))

    if word_entry.pos == "unknown":  # coverage: always true
        word_entry.pos = "soft-redirect"


def process_zh_forms(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
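    """Collect simplified/traditional forms, alternative forms and the
    literal meaning from a {{zh-forms}} template."""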
    # https://zh.wiktionary.org/wiki/Template:zh-forms
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        if re.fullmatch(r"s\d*", p_name):
            form_data = Form(
                form=clean_node(wxr, None, p_value), tags=["Simplified Chinese"]
            )
            if len(form_data.form) > 0:  # coverage: always true
                base_data.forms.append(form_data)
        elif re.fullmatch(r"t\d+", p_name):  # coverage: never true
            form_data = Form(
                form=clean_node(wxr, None, p_value),
                tags=["Traditional Chinese"],
            )
            if len(form_data.form) > 0:
                base_data.forms.append(form_data)
        elif p_name == "alt":
            for form_text in clean_node(wxr, None, p_value).split(","):
                texts = form_text.split("-")
                form_data = Form(form=texts[0], raw_tags=texts[1:])
                if len(form_data.form) > 0:  # coverage: always true
                    base_data.forms.append(form_data)
        elif p_name == "lit":
            lit = clean_node(wxr, None, p_value)
            base_data.literal_meaning = lit


# https://zh.wiktionary.org/wiki/Template:Zh-cat
# https://zh.wiktionary.org/wiki/Template:Catlangname
CATEGORY_TEMPLATES = frozenset(["zh-cat", "cln", "catlangname", "c", "topics"])


def add_page_end_categories(
    wxr: WiktextractContext, page_data: list[WordEntry], template: TemplateNode
) -> None:
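    """Apply categories from a page-end category template to every entry
    that belongs to the current (last) language section."""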
    if template.template_name.lower() in CATEGORY_TEMPLATES:  # coverage: never true
        categories = {}
        clean_node(wxr, categories, template)
        for data in page_data:
            if data.lang_code == page_data[-1].lang_code:
                data.categories.extend(categories.get("categories", []))