Coverage for src/wiktextract/extractor/zh/example.py: 97%
213 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-06 08:01 +0000
1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets
7from .linkage import process_linkage_templates_in_gloss
8from .models import Example, Form, Sense, WordEntry
9from .tags import translate_raw_tags
# Maps lowercased linkage template names that appear in example list items
# to the linkage field name passed to process_linkage_templates_in_gloss().
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "antonym": "antonyms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
    "cot": "coordinate_terms",
    "coo": "coordinate_terms",
    "coord": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
}
def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    parent_example: Example | None = None,
) -> None:
    """Process one example list item under a gloss.

    Dispatches on the templates found in the list item: quote/usex
    templates fill ``example_data``, linkage templates go to
    ``word_entry``, and "inline alt forms" adds forms. Nested list items
    are processed recursively, sharing this item's ``example_data`` so a
    child line (e.g. a translation) can complete the parent's example.
    """
    example_data = parent_example or Example()
    if list_item.contain_node(NodeKind.LIST) and not all(
        isinstance(n, TemplateNode)
        for n in list_item.invert_find_child(NodeKind.LIST)
    ):
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(
            wxr, sense_data, list_item, word_entry, example_data
        )
    elif list_item.contain_node(NodeKind.TEMPLATE):
        # parse example templates
        for child in list_item.find_child(NodeKind.TEMPLATE):
            template_name = child.template_name
            if (
                template_name.startswith(("quote-", "RQ:"))
                or template_name == "quote"
            ):
                extract_quote_templates(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ja-x", "ja-usex"]:
                extract_template_ja_usex(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
                # zh-x may expand to several examples (Traditional and
                # Simplified text), which are appended here directly
                sense_data.examples.extend(
                    extract_template_zh_x(wxr, child, example_data)
                )
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in [
                "ux",
                "eg",
                "usex",
                "uxi",
                "collocation",
                "co",
                "coi",
                "ko-usex",
                "ko-x",
                "koex",
                "th-usex",
                "th-x",
                "th-xi",
            ]:
                extract_template_ux(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name == "Q":
                extract_template_Q(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name.lower() in LINKAGE_TEMPLATES:
                process_linkage_templates_in_gloss(
                    wxr,
                    word_entry,
                    child,
                    LINKAGE_TEMPLATES[template_name.lower()],
                    " ".join(sense_data.glosses),
                )
            elif template_name.lower() in ["inline alt forms", "alti"]:
                extract_inline_alt_forms_template(wxr, word_entry, child)
        # child list items may carry ref/translation lines for this example
        for next_list_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, sense_data, next_list_item, word_entry, example_data
            )
    elif not list_item.contain_node(NodeKind.LIST):
        example_data.text = clean_node(wxr, None, list_item.children)

    # only the top-level call appends, so child lines don't duplicate it
    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)
def extract_plain_text_example_list(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    example_data: Example,
) -> None:
    """Handle a plain-text example whose source precedes a nested list.

    The text before each nested list becomes the example reference;
    every item of that nested list is then processed as a child
    example line sharing ``example_data``.
    """
    for list_index, sub_list in list_item.find_child(
        NodeKind.LIST, with_index=True
    ):
        ref_nodes = list_item.children[:list_index]
        example_data.ref = clean_node(wxr, None, ref_nodes)
        for sub_list_item in sub_list.find_child(NodeKind.LIST_ITEM):
            extract_example_list_item(
                wxr, sense, sub_list_item, word_entry, example_data
            )
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates.

    Fills ``example_data`` fields from the CSS classes in the expanded
    HTML: "cited-source" -> ref, "e-quotation" -> text (with ruby
    stripped), "e-translation" -> translation, and the first
    "e-transliteration" <i> tag -> roman.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            # quotation text may carry ruby annotations; strip them first
            ruby_data, stripped_nodes = extract_ruby(wxr, span_tag)
            example_data.ruby = ruby_data
            example_data.text = clean_node(wxr, None, stripped_nodes)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.text,
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data.translation = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.translation,
                example_data,
                "bold_translation_offsets",
            )
    # only the first transliteration tag is used
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data.roman = clean_node(wxr, None, i_tag)
        calculate_bold_offsets(
            wxr,
            i_tag,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
        break
def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process Japanese example templates ("ja-x", "ja-usex")."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Japanese text span: ruby is split out and the remaining nodes
    # become the example text
    for jpan_span in expanded_node.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, no_ruby_nodes = extract_ruby(wxr, jpan_span)
        example_data.text = clean_node(wxr, None, no_ruby_nodes)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby_nodes)),
            example_data.text,
            example_data,
            "bold_text_offsets",
        )
        example_data.ruby = ruby_data
    # transliteration span
    for tr_span in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
    # translation (3rd positional argument) and literal meaning ("lit")
    # come from the raw template arguments, expanded separately
    for arg_key, attr, offsets_attr in (
        (3, "translation", "bold_translation_offsets"),
        ("lit", "literal_meaning", "bold_literal_offsets"),
    ):
        arg_root = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(node.template_parameters.get(arg_key, "")),
            expand_all=True,
        )
        setattr(example_data, attr, clean_node(wxr, None, arg_root))
        calculate_bold_offsets(
            wxr,
            arg_root,
            getattr(example_data, attr),
            example_data,
            offsets_attr,
        )
def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    """Process Chinese example templates ("zh-x", "zh-usex", "zh-q", "zh-co").

    Returns a list of Example objects: one per script variant
    (Traditional/Simplified) found in the expanded template. Translation
    ("2") and literal meaning ("lit") are taken from the raw template
    arguments and shared by all returned examples.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    example_data = parent_example.model_copy(deep=True)
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )
    # multi-line form: a <dl> holds the example with <dd> lines for the
    # source reference, transliteration and tag labels
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                # "出自:" prefix marks the source line
                example_data.ref = dd_text.removeprefix("出自:")
            elif not dd_text.startswith("(字面義為"):
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data.roman = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                    # bracketed span texts are raw tags, e.g. "[現代標準漢語…]"
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data.raw_tags.append(span_text.strip("[]"))
                            break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data.roman = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.roman,
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data.raw_tags.append(span_text.strip("[]"))
        # one Example per script-variant span
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = example_data.model_copy(deep=True)
                    new_example.text = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example.tags.append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    translate_raw_tags(new_example)
                    results.append(new_example)
    return results
def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    # process example text span tag and dialect span tag
    #
    # Returns one Example per "zh-Hant"/"zh-Hans" span. "vsHide" spans
    # (collapsed variants) are processed recursively; the first hidden
    # block extends the most recent result, later ones start from the
    # base example again. Small-font link spans become raw tags on the
    # most recent result (or on `example` if none exists yet).
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = example.model_copy(deep=True)
            new_example.text = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example.text,
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # small-font links hold dialect/label names
            for link_node in span_tag.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1].raw_tags.append(raw_tag)
                    else:
                        example.raw_tags.append(raw_tag)

    # translate tags only at the top of the recursion (the real <dl>)
    if dl_tag.tag == "dl":
        for data in results:
            translate_raw_tags(data)
    return results
def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:ux
    #
    # Fills example fields from the marker CSS classes in the expanded
    # template HTML; "qualifier-content" nodes become raw tags.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # marker class -> (Example field, bold-offset field), in match priority
    class_fields = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )
    for html_node in expanded_node.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        matched = False
        for marker, field, offset_field in class_fields:
            if marker in class_names:
                value = clean_node(wxr, None, html_node)
                setattr(example_data, field, value)
                calculate_bold_offsets(
                    wxr, html_node, value, example_data, offset_field
                )
                matched = True
                break
        if not matched and "qualifier-content" in class_names:
            # "、" separates multiple qualifier labels
            example_data.raw_tags.extend(
                clean_node(wxr, None, html_node).split("、")
            )
    translate_raw_tags(example_data)
def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Q
    #
    # The reference text is everything in the "wiktQuote" div before its
    # <dl> child; the <dl> holds the transliteration. Quote text,
    # translation and literal meaning come from the raw template args.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for div_tag in expanded_node.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        ref_nodes = []
        for child in div_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "dl":
                for i_tag in child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, i_tag)
                    calculate_bold_offsets(
                        wxr,
                        i_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                # stop collecting ref nodes once the <dl> is reached
                break
            ref_nodes.append(child)
        ref_text = clean_node(wxr, None, ref_nodes)
        if len(ref_text) > 0:
            example_data.ref = ref_text
    for t_arg, field in (
        ("quote", "text"),
        ("t", "translation"),
        ("trans", "translation"),
        ("lit", "literal_meaning"),
    ):
        t_arg_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(t_arg, "")
            ),
            expand_all=True,
        )
        value = clean_node(wxr, None, t_arg_node)
        if len(value) > 0:
            setattr(example_data, field, value)
            calculate_bold_offsets(
                wxr,
                t_arg_node,
                value,
                example_data,
                "bold_" + field.split("_")[0] + "_offsets",
            )
def extract_inline_alt_forms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Process "inline alt forms"/"alti" templates into WordEntry forms.

    Each span whose "lang" matches the template's first argument becomes
    a Form tagged "alternative"; a preceding qualifier span supplies its
    raw tag, and a following "tr Latn" span supplies its romanization.
    """
    sense = " ".join(word_entry.senses[-1].glosses)
    forms = []
    raw_tag = ""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        span_lang = span_tag.attrs.get("lang", "")
        if "qualifier-content" in span_class:
            # remember the qualifier for the next word span
            raw_tag = clean_node(wxr, None, span_tag)
        elif span_lang == lang:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                form = Form(form=word, sense=sense, tags=["alternative"])
                if raw_tag != "":
                    form.raw_tags.append(raw_tag)
                    # qualifier applies only to the first following word
                    raw_tag = ""
                translate_raw_tags(form)
                forms.append(form)
        elif span_class == "tr Latn" and len(forms) > 0:
            forms[-1].roman = clean_node(wxr, None, span_tag)
    word_entry.forms.extend(forms)