Coverage for src/wiktextract/extractor/zh/example.py: 97%
208 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets
7from .linkage import process_linkage_templates_in_gloss
8from .models import Example, Form, Sense, WordEntry
9from .tags import translate_raw_tags
# Maps gloss-level linkage template names (lowercased before lookup)
# to the corresponding WordEntry linkage field name.
LINKAGE_TEMPLATES = {
    name: field
    for field, names in (
        ("synonyms", ("syn", "synonyms")),
        ("antonyms", ("ant", "antonyms", "antonym")),
        ("hypernyms", ("hyper", "hypernyms")),
        ("hyponyms", ("hypo", "hyponyms")),
        ("coordinate_terms", ("cot", "coo", "coord", "coordinate terms")),
    )
    for name in names
}
def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    parent_example: Example | None = None,
) -> None:
    """Extract examples and gloss-level linkage data from a list item.

    Dispatches on the template names found in the list item, filling one
    ``Example`` model, then recurses into nested list items so a child
    line (e.g. a translation on its own bullet) extends the same example.

    Args:
        wxr: Extraction context (parser state, page title, etc.).
        sense_data: Sense receiving examples and category links.
        word_entry: Word entry receiving linkage and alt-form data.
        parent_example: Example built by the parent list item when this
            is a recursive call; new data is merged into it.
    """
    example_data = parent_example or Example()
    if list_item.contain_node(NodeKind.LIST) and not all(
        isinstance(n, TemplateNode)
        for n in list_item.invert_find_child(NodeKind.LIST)
    ):
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(wxr, list_item, example_data)
    else:
        # parse example templates
        for child in list_item.find_child(NodeKind.TEMPLATE):
            template_name = child.template_name
            if (
                template_name.startswith(("quote-", "RQ:"))
                or template_name == "quote"
            ):
                extract_quote_templates(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ja-x", "ja-usex"]:
                extract_template_ja_usex(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
                # zh-x may expand to several examples (traditional and
                # simplified script variants), appended directly here
                sense_data.examples.extend(
                    extract_template_zh_x(wxr, child, example_data)
                )
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in [
                "ux",
                "eg",
                "usex",
                "uxi",
                "collocation",
                "co",
                "coi",
                "ko-usex",
                "ko-x",
                "koex",
                "th-usex",
                "th-x",
                "th-xi",
            ]:
                extract_template_ux(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name == "Q":
                extract_template_Q(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name.lower() in LINKAGE_TEMPLATES:
                # synonym/antonym/... templates placed under a gloss
                process_linkage_templates_in_gloss(
                    wxr,
                    word_entry,
                    child,
                    LINKAGE_TEMPLATES[template_name.lower()],
                    " ".join(sense_data.glosses),
                )
            elif template_name.lower() in ["inline alt forms", "alti"]:
                extract_inline_alt_forms_template(wxr, word_entry, child)
        # recurse so child bullets can extend the current example
        for next_list_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, sense_data, next_list_item, word_entry, example_data
            )

    # only the top-level call appends; recursive calls mutate the shared
    # Example object passed down via parent_example
    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)
def extract_plain_text_example_list(
    wxr: WiktextractContext, list_item: WikiNode, example_data: Example
) -> None:
    """Handle an example written as plain text (no template): the
    reference sits in the parent list item, the example text in the
    first child of the nested list."""
    for idx, sub_list in list_item.find_child(NodeKind.LIST, with_index=True):
        # Everything before the nested list is the citation/reference.
        ref_nodes = list_item.children[:idx]
        example_data.ref = clean_node(wxr, None, ref_nodes)
        first_item = sub_list.children[0]
        example_data.text = clean_node(wxr, None, first_item.children)
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates by scanning the expanded HTML
    for the citation source, quotation, translation and transliteration.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            quote_text = clean_node(wxr, None, span_tag)
            example_data.text = quote_text
            calculate_bold_offsets(
                wxr, span_tag, quote_text, example_data, "bold_text_offsets"
            )
        elif "e-translation" in span_class:
            tr_text = clean_node(wxr, None, span_tag)
            example_data.translation = tr_text
            calculate_bold_offsets(
                wxr,
                span_tag,
                tr_text,
                example_data,
                "bold_translation_offsets",
            )
    # only the first transliteration tag is used
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        roman_text = clean_node(wxr, None, i_tag)
        example_data.roman = roman_text
        calculate_bold_offsets(
            wxr, i_tag, roman_text, example_data, "bold_roman_offsets"
        )
        break
def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Handle Japanese example templates {{ja-x}}/{{ja-usex}}: extract
    the ruby-annotated example text, the transliteration, and the
    translation (arg 3) / literal meaning (arg "lit")."""
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        # Strip ruby annotations before computing the plain text.
        ruby_data, no_ruby = extract_ruby(wxr, jpan_span)
        text = clean_node(wxr, None, no_ruby)
        example_data.text = text
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby)),
            text,
            example_data,
            "bold_text_offsets",
        )
        example_data.ruby = ruby_data
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        roman = clean_node(wxr, None, tr_span)
        example_data.roman = roman
        calculate_bold_offsets(
            wxr, tr_span, roman, example_data, "bold_roman_offsets"
        )
    # Translation and literal meaning come from template arguments.
    for arg_key, attr, offsets_field in (
        (3, "translation", "bold_translation_offsets"),
        ("lit", "literal_meaning", "bold_literal_offsets"),
    ):
        arg_root = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(arg_key, "")
            ),
            expand_all=True,
        )
        value = clean_node(wxr, None, arg_root)
        setattr(example_data, attr, value)
        calculate_bold_offsets(
            wxr, arg_root, value, example_data, offsets_field
        )
def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    """Process Chinese example templates {{zh-x}}/{{zh-usex}}/{{zh-q}}/{{zh-co}}.

    Returns one ``Example`` per script variant (Traditional/Simplified)
    found in the expanded HTML. ``parent_example`` is deep-copied, not
    mutated, so data shared with sibling list items is preserved.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    example_data = parent_example.model_copy(deep=True)
    # second positional argument holds the translation
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )
    # a <dl> tag appears when the example is quoted from a source
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                # "出自" = "from (source)"
                example_data.ref = dd_text.removeprefix("出自:")
            elif not dd_text.startswith("(字面義為"):
                # skip the literal-meaning line ("字面義為" = "literally");
                # otherwise this <dd> holds the romanization and tags
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data.roman = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                    # bracketed spans like "[Pinyin]" become raw tags
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data.raw_tags.append(span_text.strip("[]"))
                    break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data.roman = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.roman,
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data.raw_tags.append(span_text.strip("[]"))
        # one Example per script-variant span (zh-Hant / zh-Hans)
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = example_data.model_copy(deep=True)
                    new_example.text = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example.tags.append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    translate_raw_tags(new_example)
                    results.append(new_example)
    return results
def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    """Process example-text spans and dialect spans of a {{zh-x}} <dl>.

    Returns one ``Example`` per zh-Hant/zh-Hans text span. Dialect links
    in small-font spans become raw tags on the most recent example (or
    on ``example`` itself if none was produced yet). Recurses into
    "vsHide" spans (templates with arg ``collapsed=y``).
    """
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = example.model_copy(deep=True)
            new_example.text = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example.text,
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # the first hidden block continues the last visible example;
            # later hidden blocks start from the shared parent example
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # small-font spans carry wiki links naming dialects/registers
            for link_node in span_tag.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1].raw_tags.append(raw_tag)
                    else:
                        example.raw_tags.append(raw_tag)

    # translate raw tags only once, at the outermost (<dl>) level
    if dl_tag.tag == "dl":
        for data in results:
            translate_raw_tags(data)
    return results
def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:ux
    """Handle generic usage-example templates ({{ux}}, {{co}}, {{ko-x}},
    {{th-x}}, ...) by mapping HTML class names of the expanded template
    to Example fields."""
    # Ordered: checked in sequence, first substring match wins (mirrors
    # the original elif chain).
    class_to_field = {
        "e-example": ("text", "bold_text_offsets"),
        "e-transliteration": ("roman", "bold_roman_offsets"),
        "e-translation": ("translation", "bold_translation_offsets"),
        "e-literally": ("literal_meaning", "bold_literal_offsets"),
    }
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        hit = next(
            (
                pair
                for cls, pair in class_to_field.items()
                if cls in class_names
            ),
            None,
        )
        if hit is not None:
            attr, offsets_field = hit
            value = clean_node(wxr, None, html_node)
            setattr(example_data, attr, value)
            calculate_bold_offsets(
                wxr, html_node, value, example_data, offsets_field
            )
        elif "qualifier-content" in class_names:
            # qualifiers are "、"-separated raw tags
            qualifier_text = clean_node(wxr, None, html_node)
            example_data.raw_tags.extend(qualifier_text.split("、"))
            translate_raw_tags(example_data)
def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Q
    """Process the {{Q}} quotation template.

    The reference text is gathered from the div children before the
    first <dl> tag (which holds the transliteration); quote,
    translation and literal meaning come from template arguments.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for div_tag in expanded_node.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        ref_nodes = []
        for child in div_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "dl":
                # the <dl> holds the transliteration; stop collecting
                # reference nodes once it is reached
                for i_tag in child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, i_tag)
                    calculate_bold_offsets(
                        wxr,
                        i_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                break
            ref_nodes.append(child)
        ref_text = clean_node(wxr, None, ref_nodes)
        if len(ref_text) > 0:
            example_data.ref = ref_text
    # quote/translation/literal from template args; "t" and "trans" are
    # aliases for the translation field
    for t_arg, field in (
        ("quote", "text"),
        ("t", "translation"),
        ("trans", "translation"),
        ("lit", "literal_meaning"),
    ):
        t_arg_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(t_arg, "")
            ),
            expand_all=True,
        )
        value = clean_node(wxr, None, t_arg_node)
        if len(value) > 0:
            setattr(example_data, field, value)
            calculate_bold_offsets(
                wxr,
                t_arg_node,
                value,
                example_data,
                "bold_" + field.split("_")[0] + "_offsets",
            )
def extract_inline_alt_forms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Handle {{inline alt forms}}/{{alti}}: collect alternative forms
    (with optional qualifier and romanization) into word_entry.forms."""
    gloss_text = " ".join(word_entry.senses[-1].glosses)
    collected = []
    pending_raw_tag = ""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # first positional arg is the language code; word spans carry it in
    # their "lang" attribute
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded.find_html_recursively("span"):
        s_class = span_tag.attrs.get("class", "")
        s_lang = span_tag.attrs.get("lang", "")
        if "qualifier-content" in s_class:
            # a qualifier applies to the next word span
            pending_raw_tag = clean_node(wxr, None, span_tag)
        elif s_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                form = Form(form=word, sense=gloss_text, tags=["alternative"])
                if pending_raw_tag != "":
                    form.raw_tags.append(pending_raw_tag)
                    pending_raw_tag = ""
                translate_raw_tags(form)
                collected.append(form)
        elif s_class == "tr Latn" and len(collected) > 0:
            # romanization belongs to the form just collected
            collected[-1].roman = clean_node(wxr, None, span_tag)
    word_entry.forms.extend(collected)