Coverage for src/wiktextract/extractor/zh/example.py: 97%
209 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets
7from .linkage import process_linkage_templates_in_gloss
8from .models import Example, Form, Sense, WordEntry
9from .tags import translate_raw_tags
11LINKAGE_TEMPLATES = {
12 "syn": "synonyms",
13 "synonyms": "synonyms",
14 "ant": "antonyms",
15 "antonyms": "antonyms",
16 "antonym": "antonyms",
17 "hyper": "hypernyms",
18 "hypernyms": "hypernyms",
19 "hypo": "hyponyms",
20 "hyponyms": "hyponyms",
21 "cot": "coordinate_terms",
22 "coo": "coordinate_terms",
23 "coord": "coordinate_terms",
24 "coordinate terms": "coordinate_terms",
25}
28def extract_example_list_item(
29 wxr: WiktextractContext,
30 sense_data: Sense,
31 list_item: WikiNode,
32 word_entry: WordEntry,
33 parent_example: Example | None = None,
34) -> None:
35 example_data = parent_example or Example()
36 if list_item.contain_node(NodeKind.LIST) and not all(
37 isinstance(n, TemplateNode)
38 for n in list_item.invert_find_child(NodeKind.LIST)
39 ):
40 # plain text in the nested list, not using any template
41 # https://zh.wiktionary.org/wiki/%, the second example
42 extract_plain_text_example_list(wxr, list_item, example_data)
43 else:
44 # parse example templates
45 for child in list_item.find_child(NodeKind.TEMPLATE):
46 template_name = child.template_name
47 if (
48 template_name.startswith(("quote-", "RQ:"))
49 or template_name == "quote"
50 ):
51 extract_quote_templates(wxr, child, example_data)
52 clean_node(wxr, sense_data, child) # add cat link
53 elif template_name in ["ja-x", "ja-usex"]:
54 extract_template_ja_usex(wxr, child, example_data)
55 clean_node(wxr, sense_data, child) # add cat link
56 elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
57 sense_data.examples.extend(
58 extract_template_zh_x(wxr, child, example_data)
59 )
60 clean_node(wxr, sense_data, child) # add cat link
61 elif template_name in [
62 "ux",
63 "eg",
64 "usex",
65 "uxi",
66 "collocation",
67 "co",
68 "coi",
69 "ko-usex",
70 "ko-x",
71 "koex",
72 "th-usex",
73 "th-x",
74 "th-xi",
75 ]:
76 extract_template_ux(wxr, child, example_data)
77 clean_node(wxr, sense_data, child) # add cat link
78 elif template_name == "Q":
79 extract_template_Q(wxr, child, example_data)
80 clean_node(wxr, sense_data, child) # add cat link
81 elif template_name.lower() in LINKAGE_TEMPLATES:
82 process_linkage_templates_in_gloss(
83 wxr,
84 word_entry,
85 child,
86 LINKAGE_TEMPLATES[template_name.lower()],
87 " ".join(sense_data.glosses),
88 )
89 elif template_name.lower() in ["inline alt forms", "alti"]: 89 ↛ 45line 89 didn't jump to line 45 because the condition on line 89 was always true
90 extract_inline_alt_forms_template(wxr, word_entry, child)
92 for next_list_item in list_item.find_child_recursively(
93 NodeKind.LIST_ITEM
94 ):
95 extract_example_list_item(
96 wxr, sense_data, next_list_item, word_entry, example_data
97 )
99 if len(example_data.text) > 0 and parent_example is None:
100 sense_data.examples.append(example_data)
def extract_plain_text_example_list(
    wxr: WiktextractContext, list_item: WikiNode, example_data: Example
) -> None:
    """Handle an example whose citation line is followed by a nested list
    holding the example text as plain wikitext (no template involved)."""
    for idx, sub_list in list_item.find_child(NodeKind.LIST, with_index=True):
        # everything before the nested list is the reference line
        ref_nodes = list_item.children[:idx]
        example_data.ref = clean_node(wxr, None, ref_nodes)
        # the first item of the nested list is the example text itself
        first_item = sub_list.children[0]
        example_data.text = clean_node(wxr, None, first_item.children)
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates.

    Reads the expanded HTML: the "cited-source" span becomes the reference,
    "e-quotation" the example text (with ruby separated out), "e-translation"
    the translation, and the first "e-transliteration" italic the romanization.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        if css_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span)
        elif "e-quotation" in css_class:
            example_data.ruby, without_ruby = extract_ruby(wxr, span)
            example_data.text = clean_node(wxr, None, without_ruby)
            calculate_bold_offsets(
                wxr,
                span,
                example_data.text,
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in css_class:
            example_data.translation = clean_node(wxr, None, span)
            calculate_bold_offsets(
                wxr,
                span,
                example_data.translation,
                example_data,
                "bold_translation_offsets",
            )
    # keep only the first transliteration
    for italic in expanded.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data.roman = clean_node(wxr, None, italic)
        calculate_bold_offsets(
            wxr,
            italic,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
        break
161def extract_template_ja_usex(
162 wxr: WiktextractContext, node: TemplateNode, example_data: Example
163) -> None:
164 expanded_node = wxr.wtp.parse(
165 wxr.wtp.node_to_wikitext(node), expand_all=True
166 )
167 for span_tag in expanded_node.find_html(
168 "span", attr_name="class", attr_value="Jpan"
169 ):
170 ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
171 example_data.text = clean_node(wxr, None, node_without_ruby)
172 calculate_bold_offsets(
173 wxr,
174 wxr.wtp.parse(wxr.wtp.node_to_wikitext(node_without_ruby)),
175 example_data.text,
176 example_data,
177 "bold_text_offsets",
178 )
179 example_data.ruby = ruby_data
180 for span_tag in expanded_node.find_html_recursively(
181 "span", attr_name="class", attr_value="tr"
182 ):
183 example_data.roman = clean_node(wxr, None, span_tag)
184 calculate_bold_offsets(
185 wxr,
186 span_tag,
187 example_data.roman,
188 example_data,
189 "bold_roman_offsets",
190 )
191 tr_arg = wxr.wtp.parse(
192 wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
193 expand_all=True,
194 )
195 example_data.translation = clean_node(wxr, None, tr_arg)
196 calculate_bold_offsets(
197 wxr,
198 tr_arg,
199 example_data.translation,
200 example_data,
201 "bold_translation_offsets",
202 )
203 lit_arg = wxr.wtp.parse(
204 wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
205 expand_all=True,
206 )
207 example_data.literal_meaning = clean_node(wxr, None, lit_arg)
208 calculate_bold_offsets(
209 wxr,
210 lit_arg,
211 example_data.literal_meaning,
212 example_data,
213 "bold_literal_offsets",
214 )
217def extract_template_zh_x(
218 wxr: WiktextractContext,
219 template_node: TemplateNode,
220 parent_example: Example,
221) -> list[Example]:
222 expanded_node = wxr.wtp.parse(
223 wxr.wtp.node_to_wikitext(template_node), expand_all=True
224 )
225 has_dl_tag = False
226 results = []
227 example_data = parent_example.model_copy(deep=True)
228 tr_arg = wxr.wtp.parse(
229 wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
230 expand_all=True,
231 )
232 example_data.translation = clean_node(wxr, None, tr_arg)
233 calculate_bold_offsets(
234 wxr,
235 tr_arg,
236 example_data.translation,
237 example_data,
238 "bold_translation_offsets",
239 )
240 lit_arg = wxr.wtp.parse(
241 wxr.wtp.node_to_wikitext(
242 template_node.template_parameters.get("lit", "")
243 ),
244 expand_all=True,
245 )
246 example_data.literal_meaning = clean_node(wxr, None, lit_arg)
247 calculate_bold_offsets(
248 wxr,
249 lit_arg,
250 example_data.literal_meaning,
251 example_data,
252 "bold_literal_offsets",
253 )
254 for dl_tag in expanded_node.find_html_recursively("dl"):
255 has_dl_tag = True
256 for dd_tag in dl_tag.find_html("dd"):
257 dd_text = clean_node(wxr, None, dd_tag)
258 if dd_text.startswith("出自:"):
259 example_data.ref = dd_text.removeprefix("出自:")
260 elif not dd_text.startswith("(字面義為"): 260 ↛ 256line 260 didn't jump to line 256 because the condition on line 260 was always true
261 for span_tag in dd_tag.find_html_recursively(
262 "span", attr_name="lang", attr_value="Latn"
263 ):
264 example_data.roman = clean_node(wxr, None, span_tag)
265 calculate_bold_offsets(
266 wxr,
267 span_tag,
268 example_data.roman,
269 example_data,
270 "bold_roman_offsets",
271 )
272 for span_tag in dd_tag.find_html_recursively("span"):
273 span_text = clean_node(wxr, None, span_tag)
274 if span_text.startswith("[") and span_text.endswith(
275 "]"
276 ):
277 example_data.raw_tags.append(span_text.strip("[]"))
278 break
279 results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))
281 # no source, single line example
282 if not has_dl_tag:
283 for span_tag in expanded_node.find_html( 283 ↛ 295line 283 didn't jump to line 295 because the loop on line 283 didn't complete
284 "span", attr_name="lang", attr_value="Latn"
285 ):
286 example_data.roman = clean_node(wxr, None, span_tag)
287 calculate_bold_offsets(
288 wxr,
289 span_tag,
290 example_data.roman,
291 example_data,
292 "bold_roman_offsets",
293 )
294 break
295 for span_tag in expanded_node.find_html("span"):
296 span_text = clean_node(wxr, None, span_tag)
297 if span_text.startswith("[") and span_text.endswith("]"):
298 example_data.raw_tags.append(span_text.strip("[]"))
299 for span_tag in expanded_node.find_html("span"):
300 span_lang = span_tag.attrs.get("lang", "")
301 if span_lang in ["zh-Hant", "zh-Hans"]:
302 example_text = clean_node(wxr, None, span_tag)
303 if len(example_text) > 0: 303 ↛ 299line 303 didn't jump to line 299 because the condition on line 303 was always true
304 new_example = example_data.model_copy(deep=True)
305 new_example.text = example_text
306 calculate_bold_offsets(
307 wxr,
308 span_tag,
309 example_text,
310 new_example,
311 "bold_text_offsets",
312 )
313 new_example.tags.append(
314 "Traditional-Chinese"
315 if span_lang == "zh-Hant"
316 else "Simplified-Chinese"
317 )
318 translate_raw_tags(new_example)
319 results.append(new_example)
320 return results
323def extract_zh_x_dl_span_tag(
324 wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
325) -> list[Example]:
326 # process example text span tag and dialect span tag
327 results = []
328 is_first_hide = True
329 for span_tag in dl_tag.find_html("span"):
330 span_lang = span_tag.attrs.get("lang", "")
331 if span_lang in ["zh-Hant", "zh-Hans"]:
332 new_example = example.model_copy(deep=True)
333 new_example.text = clean_node(wxr, None, span_tag)
334 calculate_bold_offsets(
335 wxr,
336 span_tag,
337 new_example.text,
338 new_example,
339 "bold_text_offsets",
340 )
341 results.append(new_example)
342 elif "vsHide" in span_tag.attrs.get("class", ""):
343 # template has arg "collapsed=y"
344 results.extend(
345 extract_zh_x_dl_span_tag(
346 wxr,
347 span_tag,
348 results[-1]
349 if is_first_hide and len(results) > 0
350 else example,
351 )
352 )
353 is_first_hide = False
354 elif "font-size:x-small" in span_tag.attrs.get("style", ""): 354 ↛ 329line 354 didn't jump to line 329 because the condition on line 354 was always true
355 for link_node in span_tag.find_child(NodeKind.LINK):
356 raw_tag = clean_node(wxr, None, link_node)
357 if len(raw_tag) > 0: 357 ↛ 355line 357 didn't jump to line 355 because the condition on line 357 was always true
358 if len(results) > 0:
359 results[-1].raw_tags.append(raw_tag)
360 else:
361 example.raw_tags.append(raw_tag)
363 if dl_tag.tag == "dl":
364 for data in results:
365 translate_raw_tags(data)
366 return results
369def extract_template_ux(
370 wxr: WiktextractContext, node: TemplateNode, example_data: Example
371) -> None:
372 # https://zh.wiktionary.org/wiki/Template:ux
373 expanded_node = wxr.wtp.parse(
374 wxr.wtp.node_to_wikitext(node), expand_all=True
375 )
376 for html_node in expanded_node.find_child_recursively(NodeKind.HTML):
377 class_names = html_node.attrs.get("class", "")
378 if "e-example" in class_names:
379 example_data.text = clean_node(wxr, None, html_node)
380 calculate_bold_offsets(
381 wxr,
382 html_node,
383 example_data.text,
384 example_data,
385 "bold_text_offsets",
386 )
387 elif "e-transliteration" in class_names:
388 example_data.roman = clean_node(wxr, None, html_node)
389 calculate_bold_offsets(
390 wxr,
391 html_node,
392 example_data.roman,
393 example_data,
394 "bold_roman_offsets",
395 )
396 elif "e-translation" in class_names:
397 example_data.translation = clean_node(wxr, None, html_node)
398 calculate_bold_offsets(
399 wxr,
400 html_node,
401 example_data.translation,
402 example_data,
403 "bold_translation_offsets",
404 )
405 elif "e-literally" in class_names:
406 example_data.literal_meaning = clean_node(wxr, None, html_node)
407 calculate_bold_offsets(
408 wxr,
409 html_node,
410 example_data.literal_meaning,
411 example_data,
412 "bold_literal_offsets",
413 )
414 elif "qualifier-content" in class_names:
415 example_data.raw_tags.extend(
416 clean_node(wxr, None, html_node).split("、")
417 )
418 translate_raw_tags(example_data)
421def extract_template_Q(
422 wxr: WiktextractContext, node: TemplateNode, example_data: Example
423) -> None:
424 # https://zh.wiktionary.org/wiki/Template:Q
425 expanded_node = wxr.wtp.parse(
426 wxr.wtp.node_to_wikitext(node), expand_all=True
427 )
428 for div_tag in expanded_node.find_html(
429 "div", attr_name="class", attr_value="wiktQuote"
430 ):
431 ref_nodes = []
432 for child in div_tag.children: 432 ↛ 447line 432 didn't jump to line 447 because the loop on line 432 didn't complete
433 if isinstance(child, HTMLNode) and child.tag == "dl":
434 for i_tag in child.find_html_recursively(
435 "i", attr_name="class", attr_value="e-transliteration"
436 ):
437 example_data.roman = clean_node(wxr, None, i_tag)
438 calculate_bold_offsets(
439 wxr,
440 i_tag,
441 example_data.roman,
442 example_data,
443 "bold_roman_offsets",
444 )
445 break
446 ref_nodes.append(child)
447 ref_text = clean_node(wxr, None, ref_nodes)
448 if len(ref_text) > 0: 448 ↛ 450line 448 didn't jump to line 450 because the condition on line 448 was always true
449 example_data.ref = ref_text
450 for t_arg, field in (
451 ("quote", "text"),
452 ("t", "translation"),
453 ("trans", "translation"),
454 ("lit", "literal_meaning"),
455 ):
456 t_arg_node = wxr.wtp.parse(
457 wxr.wtp.node_to_wikitext(
458 node.template_parameters.get(t_arg, "")
459 ),
460 expand_all=True,
461 )
462 value = clean_node(wxr, None, t_arg_node)
463 if len(value) > 0:
464 setattr(example_data, field, value)
465 calculate_bold_offsets(
466 wxr,
467 t_arg_node,
468 value,
469 example_data,
470 "bold_" + field.split("_")[0] + "_offsets",
471 )
def extract_inline_alt_forms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract {{inline alt forms}} / {{alti}} into ``word_entry.forms``.

    Each span matching the template's language argument becomes a Form tagged
    "alternative"; a preceding qualifier span is attached to the next form,
    and a "tr Latn" span supplies the romanization of the previous form.
    """
    gloss_text = " ".join(word_entry.senses[-1].glosses)
    new_forms = []
    pending_raw_tag = ""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        span_lang = span.attrs.get("lang", "")
        if "qualifier-content" in css_class:
            # remember the qualifier; it applies to the next form word
            pending_raw_tag = clean_node(wxr, None, span)
        elif span_lang == lang_code:
            word = clean_node(wxr, None, span)
            if word != "":
                form = Form(form=word, sense=gloss_text, tags=["alternative"])
                if pending_raw_tag != "":
                    form.raw_tags.append(pending_raw_tag)
                    pending_raw_tag = ""
                translate_raw_tags(form)
                new_forms.append(form)
        elif css_class == "tr Latn" and len(new_forms) > 0:
            new_forms[-1].roman = clean_node(wxr, None, span)
    word_entry.forms.extend(new_forms)