Coverage for src/wiktextract/extractor/zh/example.py: 97%
184 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets
7from .linkage import process_linkage_templates_in_gloss
8from .models import Example, Sense, WordEntry
9from .tags import translate_raw_tags
# Maps gloss-level linkage template names (including short aliases) to
# the linkage field name used by process_linkage_templates_in_gloss().
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
}
def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    parent_example: Example | None = None,
) -> None:
    """Extract one example list item found under a gloss.

    Dispatches each template in the list item to its dedicated extractor,
    recurses into nested list items — sharing ``example_data`` so child
    lines (e.g. a translation on the next indent level) extend the same
    example — and appends the finished example to ``sense_data.examples``.

    ``parent_example`` is only set on recursive calls; the top-level call
    creates the Example and is the one that appends it.
    """
    example_data = parent_example or Example()
    if list_item.contain_node(NodeKind.LIST) and not all(
        isinstance(n, TemplateNode)
        for n in list_item.invert_find_child(NodeKind.LIST)
    ):
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(wxr, list_item, example_data)
    else:
        # parse example templates
        for child in list_item.find_child(NodeKind.TEMPLATE):
            template_name = child.template_name
            if (
                template_name.startswith(("quote-", "RQ:"))
                or template_name == "quote"
            ):
                extract_quote_templates(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ja-x", "ja-usex"]:
                extract_template_ja_usex(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
                # zh-x may expand to several traditional/simplified text
                # pairs, so it returns a list that is appended directly
                sense_data.examples.extend(
                    extract_template_zh_x(wxr, child, example_data)
                )
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in [
                "ux",
                "eg",
                "usex",
                "uxi",
                "collocation",
                "co",
                "coi",
                "ko-usex",
                "ko-x",
                "koex",
                "th-usex",
                "th-x",
                "th-xi",
            ]:
                extract_template_ux(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name == "Q":
                extract_template_Q(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in LINKAGE_TEMPLATES:
                # linkage templates under a gloss attach to the word
                # entry, keyed by the first gloss text (if any)
                process_linkage_templates_in_gloss(
                    wxr,
                    word_entry,
                    child,
                    LINKAGE_TEMPLATES[template_name],
                    sense_data.glosses[0]
                    if len(sense_data.glosses) > 0
                    else "",
                )
            else:
                # unknown template: fall back to its plain-text expansion
                example_data.text = clean_node(wxr, None, child)

    for next_list_item in list_item.find_child_recursively(
        NodeKind.LIST_ITEM
    ):
        extract_example_list_item(
            wxr, sense_data, next_list_item, word_entry, example_data
        )

    # only the top-level call appends; recursive calls mutate the shared
    # example in place via parent_example
    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)
def extract_plain_text_example_list(
    wxr: WiktextractContext, list_item: WikiNode, example_data: Example
) -> None:
    """Handle an example written as plain wikitext with a nested list.

    The text before the nested list is the source reference; the first
    item of the nested list is the example text itself.
    """
    for idx, child_list in list_item.find_child(
        NodeKind.LIST, with_index=True
    ):
        ref_nodes = list_item.children[:idx]
        example_data.ref = clean_node(wxr, None, ref_nodes)
        first_item = child_list.children[0]
        example_data.text = clean_node(wxr, None, first_item.children)
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates.
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    for span_node in root.find_html_recursively("span"):
        css_class = span_node.attrs.get("class", "")
        if css_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span_node)
        elif "e-quotation" in css_class:
            example_data.text = clean_node(wxr, None, span_node)
            calculate_bold_offsets(
                wxr,
                span_node,
                example_data.text,
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in css_class:
            example_data.translation = clean_node(wxr, None, span_node)
            calculate_bold_offsets(
                wxr,
                span_node,
                example_data.translation,
                example_data,
                "bold_translation_offsets",
            )
    # only the first transliteration <i> tag is used
    for italic_node in root.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data.roman = clean_node(wxr, None, italic_node)
        calculate_bold_offsets(
            wxr,
            italic_node,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
        break
def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Extract the Japanese usage example templates {{ja-x}}/{{ja-usex}}."""
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    # the example text is in the <span class="Jpan"> tag; ruby
    # annotations are stripped out before cleaning the text
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, no_ruby_node = extract_ruby(wxr, jpan_span)
        example_data.text = clean_node(wxr, None, no_ruby_node)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby_node)),
            example_data.text,
            example_data,
            "bold_text_offsets",
        )
        example_data.ruby = ruby_data
    # romanization is in a <span class="tr"> tag
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
    # translation comes from positional arg 3, literal meaning from "lit"
    for arg_key, field, offset_field in (
        (3, "translation", "bold_translation_offsets"),
        ("lit", "literal_meaning", "bold_literal_offsets"),
    ):
        arg_root = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(arg_key, "")
            ),
            expand_all=True,
        )
        setattr(example_data, field, clean_node(wxr, None, arg_root))
        calculate_bold_offsets(
            wxr,
            arg_root,
            getattr(example_data, field),
            example_data,
            offset_field,
        )
def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    """Extract Chinese usage example templates ({{zh-x}} and aliases).

    One template may expand to several examples (traditional and
    simplified script variants), so a list is returned; each result is a
    deep copy of ``parent_example`` with the extracted fields filled in.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    example_data = parent_example.model_copy(deep=True)
    # translation comes from positional arg 2
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    # literal meaning comes from the "lit" arg
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )
    # multi-line layout: a <dl> tag holds source/roman/tag <dd> lines
    # followed by the example text span tags
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                # "出自:" ("from:") prefixes the source reference
                example_data.ref = dd_text.removeprefix("出自:")
            elif not dd_text.startswith("(字面義為"):
                # skip the literal-meaning line; it was already taken
                # from the "lit" template arg above
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data.roman = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                    # bracketed text like "[Pinyin]" is a raw tag; only
                    # the first span after the roman span is checked
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data.raw_tags.append(span_text.strip("[]"))
                        break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data.roman = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.roman,
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data.raw_tags.append(span_text.strip("[]"))
        # each zh-Hant/zh-Hans span becomes its own Example copy tagged
        # with the script variant
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = example_data.model_copy(deep=True)
                    new_example.text = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example.tags.append(
                        "Traditional Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified Chinese"
                    )
                    translate_raw_tags(new_example)
                    results.append(new_example)
    return results
def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    # process example text span tag and dialect span tag
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            # each script variant span becomes its own Example copy
            new_example = example.model_copy(deep=True)
            new_example.text = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example.text,
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"; recurse into the hidden
            # span. The first hidden span continues the last example
            # already collected; later ones start from the base example.
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # small-font links are dialect/register labels; attach them
            # to the most recent example (or the base example if none)
            for link_node in span_tag.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1].raw_tags.append(raw_tag)
                    else:
                        example.raw_tags.append(raw_tag)

    # translate tags only at the outermost (real <dl>) call, after all
    # nested vsHide spans have been processed
    if dl_tag.tag == "dl":
        for data in results:
            translate_raw_tags(data)
    return results
def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Extract generic usage-example templates like {{ux}} and {{co}}.

    https://zh.wiktionary.org/wiki/Template:ux
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    # CSS class fragment -> (Example field, bold-offset field),
    # checked in order; the first match wins
    class_to_field = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        for fragment, field, offset_field in class_to_field:
            if fragment in class_names:
                value = clean_node(wxr, None, html_node)
                setattr(example_data, field, value)
                calculate_bold_offsets(
                    wxr, html_node, value, example_data, offset_field
                )
                break
        else:
            # qualifier labels are "、"-separated raw tags
            if "qualifier-content" in class_names:
                example_data.raw_tags.extend(
                    clean_node(wxr, None, html_node).split("、")
                )
    translate_raw_tags(example_data)
def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Q
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for div_tag in expanded_node.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        # everything before the first <dl> tag is the source reference;
        # the <dl> holds the transliteration, and the loop stops there
        ref_nodes = []
        for child in div_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "dl":
                for i_tag in child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, i_tag)
                    calculate_bold_offsets(
                        wxr,
                        i_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                    break
                break
            ref_nodes.append(child)
        ref_text = clean_node(wxr, None, ref_nodes)
        if len(ref_text) > 0:
            example_data.ref = ref_text
    # text/translation/literal meaning come from template args; "trans"
    # overrides "t" when both are present and non-empty
    for t_arg, field in (
        ("quote", "text"),
        ("t", "translation"),
        ("trans", "translation"),
        ("lit", "literal_meaning"),
    ):
        t_arg_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(t_arg, "")
            ),
            expand_all=True,
        )
        value = clean_node(wxr, None, t_arg_node)
        if len(value) > 0:
            setattr(example_data, field, value)
            calculate_bold_offsets(
                wxr,
                t_arg_node,
                value,
                example_data,
                "bold_" + field.split("_")[0] + "_offsets",
            )