Coverage for src/wiktextract/extractor/zh/example.py: 97% (213 statements)
coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets
7from .linkage import process_linkage_templates_in_gloss
8from .models import Example, Form, Sense, WordEntry
9from .tags import translate_raw_tags
# Maps linkage template names (matched lowercased) to the WordEntry linkage
# field they populate; several template aliases share one canonical field.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "antonym": "antonyms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
    "cot": "coordinate_terms",
    "coo": "coordinate_terms",
    "coord": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
}
def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    parent_example: Example | None = None,
) -> None:
    """Extract one example list item under a gloss.

    Dispatches on the templates found in the list item (quote-*, ja-x,
    zh-x, ux, Q, linkage and inline-alt-forms templates), recurses into
    nested list items, and appends the resulting example to
    ``sense_data.examples`` when this is the top-level call.
    """
    example_data = parent_example or Example(text="")
    if list_item.contain_node(NodeKind.LIST) and not all(
        isinstance(n, TemplateNode)
        for n in list_item.invert_find_child(NodeKind.LIST)
    ):
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(
            wxr, sense_data, list_item, word_entry, example_data
        )
    elif list_item.contain_node(NodeKind.TEMPLATE):
        # parse example templates
        for child in list_item.find_child(NodeKind.TEMPLATE):
            template_name = child.template_name
            if (
                template_name.startswith(("quote-", "RQ:"))
                or template_name == "quote"
            ):
                extract_quote_templates(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ja-x", "ja-usex"]:
                extract_template_ja_usex(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
                # zh-x may yield several examples (Traditional/Simplified)
                sense_data.examples.extend(
                    extract_template_zh_x(wxr, child, example_data)
                )
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in [
                "ux",
                "eg",
                "usex",
                "uxi",
                "collocation",
                "co",
                "coi",
                "ko-usex",
                "ko-x",
                "koex",
                "th-usex",
                "th-x",
                "th-xi",
                "uxa",
            ]:
                extract_template_ux(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name == "Q":
                extract_template_Q(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name.lower() in LINKAGE_TEMPLATES:
                process_linkage_templates_in_gloss(
                    wxr,
                    word_entry,
                    child,
                    LINKAGE_TEMPLATES[template_name.lower()],
                    " ".join(sense_data.glosses),
                )
            elif template_name.lower() in ["inline alt forms", "alti"]:
                extract_inline_alt_forms_template(wxr, word_entry, child)

        # nested list items (e.g. translation lines) refine this example
        for next_list_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, sense_data, next_list_item, word_entry, example_data
            )
    elif not list_item.contain_node(NodeKind.LIST):
        # no template and no nested list: the item is plain example text
        example_data.text = clean_node(wxr, None, list_item.children)

    # only the outermost call saves the example, and only if it has text
    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)
def extract_plain_text_example_list(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    example_data: Example,
) -> None:
    """Handle an example whose citation line precedes a nested plain list.

    The text before each nested list becomes the example reference; every
    item of the nested list is then parsed as the example body.
    """
    for list_index, sub_list in list_item.find_child(
        NodeKind.LIST, with_index=True
    ):
        ref_nodes = list_item.children[:list_index]
        example_data.ref = clean_node(wxr, None, ref_nodes)
        for sub_list_item in sub_list.find_child(NodeKind.LIST_ITEM):
            extract_example_list_item(
                wxr, sense, sub_list_item, word_entry, example_data
            )
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates.

    Fills ``ref``, ``text`` (with ruby stripped), ``translation`` and
    ``roman`` on *example_data* from the expanded template HTML.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        if css_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span)
        elif "e-quotation" in css_class:
            example_data.ruby, without_ruby = extract_ruby(wxr, span)
            example_data.text = clean_node(wxr, None, without_ruby)
            calculate_bold_offsets(
                wxr,
                span,
                example_data.text,
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in css_class:
            example_data.translation = clean_node(wxr, None, span)
            calculate_bold_offsets(
                wxr,
                span,
                example_data.translation,
                example_data,
                "bold_translation_offsets",
            )
    # only the first transliteration node is kept
    for italic in expanded.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data.roman = clean_node(wxr, None, italic)
        calculate_bold_offsets(
            wxr,
            italic,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
        break
def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process the Japanese usage example templates {{ja-x}}/{{ja-usex}}."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Japanese text (with ruby annotations) lives in <span class="Jpan">.
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, without_ruby = extract_ruby(wxr, jpan_span)
        example_data.text = clean_node(wxr, None, without_ruby)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(without_ruby)),
            example_data.text,
            example_data,
            "bold_text_offsets",
        )
        example_data.ruby = ruby_data
    # Romanization is in <span class="tr">.
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
    # Translation comes from the third positional template argument.
    tr_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_node)
    calculate_bold_offsets(
        wxr,
        tr_node,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    # Literal meaning comes from the "lit" argument.
    lit_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_node)
    calculate_bold_offsets(
        wxr,
        lit_node,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )
def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    """Process Chinese usage example templates ({{zh-x}} and aliases).

    Returns a list of Example objects — typically one per script variant
    (Traditional/Simplified) — copied from *parent_example* and filled
    with text, romanization, translation, literal meaning and tags.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    example_data = parent_example.model_copy(deep=True)
    # second positional argument holds the translation
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    # "lit" argument holds the literal meaning
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )
    # multi-line layout: a <dl> wraps source line (<dd>) and example spans
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                # source/citation line
                example_data.ref = dd_text.removeprefix("出自:")
            elif not dd_text.startswith("(字面義為"):
                # romanization line; only the first Latn span is used
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data.roman = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                    # bracketed spans like "[Classical Chinese]" are raw tags
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data.raw_tags.append(span_text.strip("[]"))
                    break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data.roman = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.roman,
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data.raw_tags.append(span_text.strip("[]"))
        # one example per Chinese-script span (Traditional and Simplified)
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = example_data.model_copy(deep=True)
                    new_example.text = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example.tags.append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    translate_raw_tags(new_example)
                    results.append(new_example)
    return results
def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    # process example text span tag and dialect span tag
    # Returns one Example per Chinese-script span found; recurses into
    # "vsHide" spans produced by the template's "collapsed=y" argument.
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = example.model_copy(deep=True)
            new_example.text = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example.text,
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # the first hidden block inherits data from the last visible
            # example; later ones start from the plain parent example
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # small-font links carry dialect/register labels → raw tags
            for link_node in span_tag.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1].raw_tags.append(raw_tag)
                    else:
                        example.raw_tags.append(raw_tag)

    # translate tags only once, at the outermost (<dl>) call
    if dl_tag.tag == "dl":
        for data in results:
            translate_raw_tags(data)
    return results
def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:ux
    # Maps a css class marker to the Example attribute it fills and the
    # matching bold-offset attribute; checked in this order, first match wins.
    field_map = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        for marker, attr, offsets_attr in field_map:
            if marker in class_names:
                value = clean_node(wxr, None, html_node)
                setattr(example_data, attr, value)
                calculate_bold_offsets(
                    wxr, html_node, value, example_data, offsets_attr
                )
                break
        else:
            # qualifier labels become raw tags, split on the CJK comma
            if "qualifier-content" in class_names:
                example_data.raw_tags.extend(
                    clean_node(wxr, None, html_node).split("、")
                )
                translate_raw_tags(example_data)
def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Q
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for div_tag in expanded_node.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        # children before the <dl> form the citation; the <dl> holds the
        # transliteration and ends the scan of this div
        ref_nodes = []
        for child in div_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "dl":
                for i_tag in child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, i_tag)
                    calculate_bold_offsets(
                        wxr,
                        i_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                break
            ref_nodes.append(child)
        ref_text = clean_node(wxr, None, ref_nodes)
        if len(ref_text) > 0:
            example_data.ref = ref_text
    # text/translation/literal meaning come from template arguments,
    # parsed separately so bold offsets can be computed per field
    for t_arg, field in (
        ("quote", "text"),
        ("t", "translation"),
        ("trans", "translation"),
        ("lit", "literal_meaning"),
    ):
        t_arg_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(t_arg, "")
            ),
            expand_all=True,
        )
        value = clean_node(wxr, None, t_arg_node)
        if len(value) > 0:
            setattr(example_data, field, value)
            calculate_bold_offsets(
                wxr,
                t_arg_node,
                value,
                example_data,
                "bold_" + field.split("_")[0] + "_offsets",
            )
def extract_inline_alt_forms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Process {{inline alt forms}}/{{alti}} and add Form objects.

    Each form is tagged "alternative", linked to the gloss of the most
    recently added sense, and may carry a preceding qualifier as a raw
    tag and a following "tr Latn" span as its romanization.
    """
    # Fix: the original indexed word_entry.senses[-1] unconditionally,
    # raising IndexError when no sense has been stored yet (the current
    # sense is passed around separately and may not be appended yet).
    sense = (
        " ".join(word_entry.senses[-1].glosses)
        if len(word_entry.senses) > 0
        else ""
    )
    forms = []
    raw_tag = ""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # first positional argument is the language code; spans with a
    # matching "lang" attribute contain the alternative forms
    lang = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        span_lang = span_tag.attrs.get("lang", "")
        if "qualifier-content" in span_class:
            # qualifier applies to the next form word
            raw_tag = clean_node(wxr, None, span_tag)
        elif span_lang == lang:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                form = Form(form=word, sense=sense, tags=["alternative"])
                if raw_tag != "":
                    form.raw_tags.append(raw_tag)
                    raw_tag = ""
                translate_raw_tags(form)
                forms.append(form)
        elif span_class == "tr Latn" and len(forms) > 0:
            # transliteration span follows the form it romanizes
            forms[-1].roman = clean_node(wxr, None, span_tag)
    word_entry.forms.extend(forms)