Coverage for src/wiktextract/extractor/zh/page.py: 77%
167 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 LevelNode,
8 NodeKind,
9 TemplateNode,
10 WikiNode,
11)
13from ...page import clean_node
14from ...wxr_context import WiktextractContext
15from ...wxr_logging import logger
16from .descendant import extract_descendant_section
17from .etymology import extract_etymology
18from .gloss import extract_gloss
19from .headword_line import extract_headword_line_template, extract_tlb_template
20from .inflection import extract_inflections
21from .linkage import extract_linkage_section
22from .models import Form, Sense, WordEntry
23from .note import extract_note
24from .pronunciation import extract_pronunciation
25from .section_titles import (
26 DESCENDANTS_TITLES,
27 ETYMOLOGY_TITLES,
28 IGNORED_TITLES,
29 INFLECTION_TITLES,
30 LINKAGE_TITLES,
31 NOTES_TITLES,
32 POS_TITLES,
33 PRONUNCIATION_TITLES,
34 TRANSLATIONS_TITLES,
35)
36from .translation import extract_translation
# Trailing disambiguation suffix of a section title: either a parenthesised
# note (full-width or ASCII brackets) or a bare number.  The brackets are
# matched with character classes so they can never be read as a regex group;
# an unescaped "(.+)" would match the whole title and erase it.
_SUBTITLE_SUFFIX_RE = re.compile(r"\s*(?:[（(].+[）)]|\d+)$")

# Lower-cased names of the templates whose presence marks a section as a
# descendants section rather than an ordinary linkage section.
_DESCENDANT_TEMPLATE_NAMES = frozenset(
    ["desc", "descendant", "desctree", "descendants tree", "cjkv"]
)


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section to its extractor, then recurse into sub-sections.

    The section title is cleaned, stripped of a trailing number or
    parenthesised suffix, and looked up in the title tables imported from
    ``section_titles``.  Titles that match no table are reported through
    ``wxr.wtp.debug``.

    Args:
        wxr: Extraction context; its ``config.capture_*`` flags gate most
            branches.
        page_data: Entries collected so far for the page; extended in place.
        base_data: Language-level template entry.  It is passed to the
            extractors (wrapped in a one-element list) when ``page_data`` is
            still empty, and deep-copied when a new entry is needed.
        level_node: The section node to process.
    """
    subtitle = clean_node(wxr, None, level_node.largs)
    subtitle = _SUBTITLE_SUFFIX_RE.sub("", subtitle)
    wxr.wtp.start_subsection(subtitle)

    # Extractors that annotate existing entries fall back to the base entry
    # when no entry has been created yet.  None of the branches that use this
    # list run after page_data has been modified in this call.
    entries = page_data or [base_data]

    if subtitle in IGNORED_TITLES:
        pass
    elif subtitle in POS_TITLES:
        process_pos_block(wxr, page_data, base_data, level_node, subtitle)
    elif wxr.config.capture_etymologies and subtitle.startswith(
        tuple(ETYMOLOGY_TITLES)
    ):
        extract_etymology(wxr, page_data, base_data, level_node)
    elif wxr.config.capture_pronunciation and subtitle in PRONUNCIATION_TITLES:
        extract_pronunciation(wxr, page_data, base_data, level_node)
    elif wxr.config.capture_linkages and subtitle in LINKAGE_TITLES:
        # A title listed in both tables is treated as a descendants section
        # only if it actually contains one of the descendant templates.
        is_descendant_section = subtitle in DESCENDANTS_TITLES and any(
            t_node.template_name.lower() in _DESCENDANT_TEMPLATE_NAMES
            for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE)
        )
        if not is_descendant_section:
            extract_linkage_section(
                wxr, entries, level_node, LINKAGE_TITLES[subtitle]
            )
        elif wxr.config.capture_descendants:
            extract_descendant_section(wxr, level_node, entries)
    elif wxr.config.capture_translations and subtitle in TRANSLATIONS_TITLES:
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation(wxr, page_data, level_node)
    elif wxr.config.capture_inflections and subtitle in INFLECTION_TITLES:
        extract_inflections(wxr, entries, level_node)
    elif wxr.config.capture_descendants and subtitle in DESCENDANTS_TITLES:
        extract_descendant_section(wxr, level_node, entries)
    elif subtitle in NOTES_TITLES:
        extract_note(wxr, entries, level_node)
    else:
        wxr.wtp.debug(
            f"Unhandled subtitle: {subtitle}",
            sortid="extractor/zh/page/parse_section/192",
        )

    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)

    # Category templates that are direct children of the section are applied
    # after the nested sections have been processed.
    for template in level_node.find_child(NodeKind.TEMPLATE):
        add_page_end_categories(wxr, page_data, template)
def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_text: str,
):
    """Start a new entry for a part-of-speech section and collect its senses.

    ``base_data.pos`` is updated, a deep copy of ``base_data`` is appended to
    ``page_data``, and the section's children are handed to the headword,
    ``tlb`` and gloss extractors.  If that yields no sense and the section
    has no list node, the section text itself is used as the gloss.
    """
    pos_info = POS_TITLES[pos_text]
    base_data.pos = pos_info["pos"]
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].tags.extend(pos_info.get("tags", []))

    for position, node in enumerate(level_node.filter_empty_str_child()):
        if not isinstance(node, WikiNode):
            continue
        is_template = isinstance(node, TemplateNode)
        if is_template and position == 0:
            # The first non-empty child, when it is a template, is treated
            # as the headword line.
            extract_headword_line_template(
                wxr, page_data, node, base_data.lang_code
            )
            process_soft_redirect_template(wxr, node, page_data)
        elif is_template and node.template_name == "tlb":
            extract_tlb_template(wxr, node, page_data)
        elif node.kind == NodeKind.LIST:
            extract_gloss(wxr, page_data, node, Sense())

    entry = page_data[-1]
    if entry.senses or level_node.contain_node(NodeKind.LIST):
        return

    # low quality pages don't put gloss in list
    gloss_text = clean_node(
        wxr,
        entry,
        list(level_node.invert_find_child(LEVEL_KIND_FLAGS)),
    )
    if gloss_text:
        fallback_sense = Sense(glosses=[gloss_text])
    else:
        fallback_sense = Sense(tags=["no-gloss"])
    entry.senses.append(fallback_sense)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its entries as plain dictionaries.

    Each level-2 node is treated as a language section.  Its level-3
    children go through ``parse_section``; a language section without any
    level-3 node is handled by ``process_low_quality_page``.  Entries that
    end up without senses receive a ``no-gloss`` sense before being dumped
    with default values excluded.
    """
    # page layout documents
    # https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
    # https://zh.wiktionary.org/wiki/Wiktionary:体例说明
    # https://zh.wiktionary.org/wiki/Wiktionary:格式手冊

    # skip translation pages
    translation_suffixes = tuple(
        "/" + tr_title for tr_title in TRANSLATIONS_TITLES
    )
    if page_title.endswith(translation_suffixes):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    entries = []
    for language_node in tree.find_child(NodeKind.LEVEL2):
        collected = {}
        lang_name = clean_node(wxr, collected, language_node.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":
            wxr.wtp.warning(
                f"Unrecognized language name: {lang_name}",
                sortid="extractor/zh/page/parse_page/509",
            )
            lang_code = "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = collected.get("categories", [])

        for template_node in language_node.find_child(NodeKind.TEMPLATE):
            if template_node.template_name == "zh-forms":
                process_zh_forms(wxr, base_data, template_node)

        for section_node in language_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, entries, base_data, section_node)

        if not language_node.contain_node(NodeKind.LEVEL3):
            # No sub-sections at all: treat the language section's own
            # content as the entry.
            entries.append(base_data.model_copy(deep=True))
            process_low_quality_page(wxr, language_node, entries)
            if entries[-1] == base_data:
                # Nothing was added to the copy, so drop it again.
                entries.pop()

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]
def process_low_quality_page(
    wxr: WiktextractContext,
    level_node: WikiNode,
    page_data: list[WordEntry],
) -> None:
    """Fill the last entry from a section that has no sub-sections.

    Soft-redirect templates that are direct children of ``level_node`` are
    passed to ``process_soft_redirect_template``.  If there are none, the
    cleaned text of the node becomes the entry's only gloss, and the part of
    speech is taken from the first entry category that, once the language
    name prefix is removed, is a key of ``POS_TITLES``.

    Args:
        wxr: Extraction context.
        level_node: The section node to read.
        page_data: Entry list; only its last element is modified, so it must
            not be empty.
    """
    is_soft_redirect = False
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        # Compare the lower-cased name, the same normalisation that
        # process_soft_redirect_template applies, so a capitalised spelling
        # such as "Zh-see" is recognised by both functions.
        if template_node.template_name.lower() in (
            "ja-see",
            "ja-see-kango",
            "zh-see",
        ):
            process_soft_redirect_template(wxr, template_node, page_data)
            is_soft_redirect = True

    if is_soft_redirect:
        return

    # only have a gloss text
    entry = page_data[-1]
    gloss_text = clean_node(wxr, entry, level_node.children)
    if not gloss_text:
        return

    for category in entry.categories:
        pos_title = category.removeprefix(entry.lang).strip()
        if pos_title in POS_TITLES:
            pos_data = POS_TITLES[pos_title]
            entry.pos = pos_data["pos"]
            entry.tags.extend(pos_data.get("tags", []))
            break
    entry.senses.append(Sense(glosses=[gloss_text]))
def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Record redirect targets of a soft-redirect template on the last entry.

    ``zh-see`` contributes its first positional argument; ``ja-see`` and
    ``ja-see-kango`` contribute every integer-keyed argument.  Afterwards an
    entry whose part of speech is still ``"unknown"`` is marked as
    ``"soft-redirect"``.
    """
    # https://zh.wiktionary.org/wiki/Template:Ja-see
    # https://zh.wiktionary.org/wiki/Template:Ja-see-kango
    # https://zh.wiktionary.org/wiki/Template:Zh-see
    entry = page_data[-1]
    name = template_node.template_name.lower()

    if name == "zh-see":
        target = template_node.template_parameters.get(1, "")
        entry.redirects.append(clean_node(wxr, None, target))
    elif name in ("ja-see", "ja-see-kango"):
        entry.redirects.extend(
            clean_node(wxr, None, value)
            for key, value in template_node.template_parameters.items()
            if isinstance(key, int)
        )

    if entry.pos == "unknown":
        entry.pos = "soft-redirect"
def process_zh_forms(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Copy the named arguments of a ``zh-forms`` template onto ``base_data``.

    Arguments named ``s`` followed by optional digits become forms tagged
    "Simplified Chinese"; ``t`` followed by at least one digit becomes a form
    tagged "Traditional Chinese"; ``alt`` is a comma-separated list of
    ``form-tag-tag...`` items; ``lit`` sets the literal meaning.  Positional
    arguments are skipped, and forms with empty text are not added.
    """
    # https://zh.wiktionary.org/wiki/Template:zh-forms
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue

        if re.fullmatch(r"s\d*", p_name):
            script_tag = "Simplified Chinese"
        elif re.fullmatch(r"t\d+", p_name):
            script_tag = "Traditional Chinese"
        else:
            script_tag = None

        if script_tag is not None:
            script_form = Form(
                form=clean_node(wxr, None, p_value), tags=[script_tag]
            )
            if script_form.form:
                base_data.forms.append(script_form)
        elif p_name == "alt":
            for item in clean_node(wxr, None, p_value).split(","):
                # The text before the first hyphen is the form; the
                # remaining hyphen-separated pieces are kept as raw tags.
                form_text, *raw_tags = item.split("-")
                alt_form = Form(form=form_text, raw_tags=raw_tags)
                if alt_form.form:
                    base_data.forms.append(alt_form)
        elif p_name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, p_value)
# Lower-cased names of the templates that add_page_end_categories treats as
# category templates.
# https://zh.wiktionary.org/wiki/Template:Zh-cat
# https://zh.wiktionary.org/wiki/Template:Catlangname
CATEGORY_TEMPLATES = frozenset(
    {
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
    }
)
def add_page_end_categories(
    wxr: WiktextractContext, page_data: list[WordEntry], template: TemplateNode
) -> None:
    """Add a category template's categories to the current language's entries.

    Templates whose lower-cased name is not in ``CATEGORY_TEMPLATES`` are
    ignored.  Otherwise the categories gathered by ``clean_node`` are
    appended to every entry that has the same ``lang_code`` as the last
    entry in ``page_data``.
    """
    if template.template_name.lower() not in CATEGORY_TEMPLATES:
        return

    collected = {}
    clean_node(wxr, collected, template)
    new_categories = collected.get("categories", [])
    for entry in page_data:
        # page_data[-1] is only read inside the loop, so an empty list is
        # harmless.
        if entry.lang_code == page_data[-1].lang_code:
            entry.categories.extend(new_categories)