Coverage for src/wiktextract/extractor/en/example.py: 53%
199 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
1from copy import deepcopy
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...tags import valid_tags
7from ...wxr_context import WiktextractContext
8from ..ruby import extract_ruby
9from ..share import calculate_bold_offsets
10from .type_utils import ExampleData, SenseData
# Template names grouped by the extractor that handles them.  Kept at module
# level so the collections are built once instead of on every loop iteration.
_ZH_EXAMPLE_TEMPLATES = frozenset({"zh-x", "zh-usex", "zh-q", "zh-co"})
_COLLOCATION_TEMPLATES = frozenset({"collocation", "co", "coi"})
_UX_TEMPLATES = _COLLOCATION_TEMPLATES | frozenset(
    {
        "ux",
        "usex",
        "uxi",
        "ko-usex",
        "koex",
        "ko-x",
        "th-usex",
        "th-x",
        "th-xi",
        "uxa",
        "ja-usex",
        "ja-x",
        "ja-ux",
    }
)


def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Extract examples from the templates that are direct children of a
    list item.

    Each direct template child is dispatched by its name:

    * ``zh-x`` family -> ``extract_template_zh_x`` (may yield several
      examples);
    * ``quote``, ``quote-*`` and ``RQ:*`` -> ``extract_quote_templates``.  If
      the list item contains a nested list, the quote data is not emitted
      itself; it is passed as ``parent_data`` to a recursive call for every
      ``LIST_ITEM`` found below this item;
    * usage-example / collocation templates -> ``extract_ux_template`` on a
      deep copy of ``parent_data``.

    ``parent_data`` is modified in place: missing ``"tags"`` and
    ``"raw_tags"`` keys are created as empty lists.

    Returns the list of extracted examples (possibly empty).
    """
    examples = []
    parent_data.setdefault("tags", [])
    parent_data.setdefault("raw_tags", [])
    for template_node in list_item.find_child(NodeKind.TEMPLATE):
        name = template_node.template_name
        if name in _ZH_EXAMPLE_TEMPLATES:
            examples.extend(
                extract_template_zh_x(
                    wxr, template_node, sense_data, parent_data
                )
            )
        elif name.startswith(("quote-", "RQ:")) or name == "quote":
            q_example = extract_quote_templates(wxr, template_node, sense_data)
            if list_item.contain_node(NodeKind.LIST):
                for next_list_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    # extract_quote_templates() strips empty keys, so make
                    # sure the tag lists exist before the data is reused as
                    # the parent of the nested items.
                    for key in ("tags", "raw_tags"):
                        q_example.setdefault(key, [])
                    examples.extend(
                        extract_example_list_item(
                            wxr, next_list_item, sense_data, q_example
                        )
                    )
            else:
                examples.append(q_example)
        elif name in _UX_TEMPLATES:
            # Work on a copy so sibling templates do not share tag lists.
            copy_of_parent_data = deepcopy(parent_data)
            if name in _COLLOCATION_TEMPLATES:
                copy_of_parent_data["tags"].append("collocation")
            examples.append(
                extract_ux_template(
                    wxr, template_node, sense_data, copy_of_parent_data
                )
            )

    return examples
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Extract one example from a ``quote``/``quote-*``/``RQ:*`` template.

    The template is expanded and the resulting HTML is scanned:

    * ``<span>`` whose class is exactly ``cited-source`` -> ``"ref"``;
    * ``<span>`` whose class contains ``e-quotation`` -> ``"text"`` (with
      ruby annotations split off into ``"ruby"`` when present);
    * ``<span>`` whose class contains ``e-translation`` -> ``"translation"``
      and the deprecated alias ``"english"``;
    * the first ``<i class="e-transliteration">`` -> ``"roman"``.

    Bold offsets are computed for text, translation and transliteration.
    The result is passed through ``clean_example_empty_data`` before it is
    returned, so empty fields are removed.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Result deliberately discarded; the call is made with ``sense_data`` as
    # its target.  NOTE(review): presumably to record categories/links on the
    # sense - confirm against clean_node().
    clean_node(wxr, sense_data, expanded_node)
    example_data = ExampleData(
        text="", ref="", english="", roman="", type="quote"
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class == "cited-source":
            example_data["ref"] = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
            if len(ruby_data) > 0:
                example_data["ruby"] = ruby_data
            example_data["text"] = clean_node(wxr, None, node_without_ruby)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["text"],
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data["translation"] = clean_node(wxr, None, span_tag)
            # DEPRECATED for "translation"
            example_data["english"] = example_data["translation"]
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["translation"],
                example_data,
                "bold_translation_offsets",
            )
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data["roman"] = clean_node(wxr, None, i_tag)
        # Offsets must come from the transliteration node itself.  The
        # previous code passed ``span_tag``, a leftover from the loop above
        # (the wrong node, or unbound when no <span> was found).
        calculate_bold_offsets(
            wxr,
            i_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
        break  # only the first transliteration is used
    clean_example_empty_data(example_data)
    return example_data
def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Fill ``example_data`` from a Japanese usage-example template.

    See https://en.wiktionary.org/wiki/Template:ja-usex

    The example text and ruby come from direct-child ``<span class="Jpan">``
    elements of the expansion, the romanization from ``<span class="tr">``
    elements, the translation from positional parameter 3 and the literal
    meaning from the ``lit`` parameter.  ``example_data`` is modified in
    place, cleaned with ``clean_example_empty_data`` and returned.
    """

    def parse_argument(key):
        # Parse (and fully expand) one template argument; a missing argument
        # is treated as the empty string.
        return wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(node.template_parameters.get(key, "")),
            expand_all=True,
        )

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    clean_node(wxr, sense_data, expanded)

    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, node_without_ruby = extract_ruby(wxr, jpan_span)
        text = clean_node(wxr, None, node_without_ruby)
        example_data["text"] = text
        # The ruby-free node is re-parsed from its wikitext before the bold
        # offsets are calculated.
        reparsed = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node_without_ruby))
        calculate_bold_offsets(
            wxr, reparsed, text, example_data, "bold_text_offsets"
        )
        example_data["ruby"] = ruby_data

    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        roman = clean_node(wxr, None, tr_span)
        example_data["roman"] = roman
        calculate_bold_offsets(
            wxr, tr_span, roman, example_data, "bold_roman_offsets"
        )

    translation_node = parse_argument(3)
    translation = clean_node(wxr, None, translation_node)
    example_data["translation"] = translation
    example_data["english"] = translation  # DEPRECATED for "translation"
    calculate_bold_offsets(
        wxr,
        translation_node,
        translation,
        example_data,
        "bold_translation_offsets",
    )

    literal_node = parse_argument("lit")
    literal = clean_node(wxr, None, literal_node)
    example_data["literal_meaning"] = literal
    calculate_bold_offsets(
        wxr, literal_node, literal, example_data, "bold_literal_offsets"
    )

    clean_example_empty_data(example_data)
    return example_data
def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
    """Extract examples from a Chinese usage-example template.

    See https://en.wiktionary.org/wiki/Template:zh-x

    A deep copy of ``parent_example`` is used as the base for every returned
    example.  The translation is read from positional parameter 2 and the
    literal meaning from the ``lit`` parameter.

    Two layouts of the expanded template are handled:

    * the expansion contains ``<dl>`` elements: the reference ("From:" line),
      romanization and bracketed labels are read from the ``<dd>`` children
      and the example texts are produced by ``extract_zh_x_dl_span_tag``;
    * no ``<dl>`` (no source, single line example): romanization, bracketed
      labels and the ``zh-Hant`` / ``zh-Hans`` texts are read from the
      direct-child ``<span>`` elements.

    ``parent_example`` is expected to already contain a ``"raw_tags"`` list
    (and ``"tags"``, used by ``add_zh_hant_hans_spans``).

    Returns one example per Traditional/Simplified text span found.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)
    has_dl_tag = False
    results = []
    example_data = deepcopy(parent_example)

    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, tr_arg)
    example_data["english"] = example_data["translation"]
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )

    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    # Bold offsets of the literal meaning must be taken from the ``lit``
    # argument itself (the translation argument was passed here before).
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )

    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            elif not dd_text.startswith("(literally,"):
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data["roman"],
                        example_data,
                        "bold_roman_offsets",
                    )
                # The first span whose text is wrapped in square brackets is
                # stored as a raw tag (brackets stripped).
                for span_tag in dd_tag.find_html_recursively("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text.startswith("[") and span_text.endswith("]"):
                        example_data["raw_tags"].append(span_text.strip("[]"))
                        break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["roman"],
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ("zh-Hant", "zh-Hans"):
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    results.append(
                        add_zh_hant_hans_spans(
                            wxr,
                            example_data,
                            example_text,
                            span_tag,
                            span_lang,
                        )
                    )
    return results
def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    """Process the example text spans and dialect spans below ``dl_tag``.

    Direct-child ``<span>`` elements are handled by kind:

    * ``lang`` is ``zh-Hant`` or ``zh-Hans``: a new example is created from
      ``example`` by ``add_zh_hant_hans_spans``;
    * class contains ``vsHide`` (template has arg "collapsed=y"): the span is
      processed recursively.  The first such span uses the most recent
      example as its base when one exists, otherwise ``example`` is used;
    * style contains ``font-size:x-small``: the text of every link inside is
      added as a raw tag to the most recent example, or to ``example`` when
      no example has been produced yet.

    When ``dl_tag`` really is a ``<dl>`` element (i.e. not a recursive call
    on a span), all results are cleaned with ``clean_example_empty_data``.
    """
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ("zh-Hant", "zh-Hans"):
            results.append(
                add_zh_hant_hans_spans(
                    wxr,
                    example,
                    clean_node(wxr, None, span_tag),
                    span_tag,
                    span_lang,
                )
            )
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            base_example = (
                results[-1] if is_first_hide and len(results) > 0 else example
            )
            results.extend(
                extract_zh_x_dl_span_tag(wxr, span_tag, base_example)
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            target = results[-1] if len(results) > 0 else example
            for link_node in span_tag.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    # Examples built by add_zh_hant_hans_spans() have already
                    # been cleaned, which removes an empty "raw_tags" key;
                    # recreate it instead of raising KeyError.
                    target.setdefault("raw_tags", []).append(raw_tag)

    if dl_tag.tag == "dl":
        for data in results:
            clean_example_empty_data(data)
    return results
# Labels found in expanded Chinese example templates, mapped to the tag that
# clean_example_empty_data() stores in "tags" in place of the raw label.
ZH_X_TAGS = {
    # script variants
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
    # language variety labels
    "Taiwanese Mandarin": "Taiwanese-Mandarin",
    "MSC": "Standard-Chinese",
    "Literary Chinese": "Literary-Chinese",
    "Classical Chinese": "Classical-Chinese",
    "Guangzhou Cantonese": "Guangzhou-Cantonese",
}
def clean_example_empty_data(data: ExampleData) -> None:
    """Normalize an example in place: convert raw tags, set its type and
    drop empty fields.

    * Raw tags listed in ``ZH_X_TAGS`` are appended to ``data["tags"]`` in
      their mapped form, raw tags found in ``valid_tags`` are appended
      unchanged, and everything else stays in ``"raw_tags"``.
    * ``"type"`` becomes ``"quotation"`` when ``"ref"`` is non-empty and
      ``"example"`` otherwise.
    * Every key whose value has length zero is removed.
    """
    unrecognized = []
    for raw_tag in data.get("raw_tags", []):
        mapped_tag = ZH_X_TAGS.get(raw_tag)
        if mapped_tag is not None:
            data["tags"].append(mapped_tag)
        elif raw_tag in valid_tags:
            data["tags"].append(raw_tag)
        else:
            unrecognized.append(raw_tag)
    data["raw_tags"] = unrecognized

    has_reference = len(data.get("ref", "")) > 0
    data["type"] = "quotation" if has_reference else "example"

    # Collect the keys first so the dict is not modified while iterating.
    empty_keys = [key for key, value in data.items() if len(value) == 0]
    for key in empty_keys:
        del data[key]
def extract_ux_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Fill ``example_data`` from an expanded usage-example template.

    Every HTML node of the expansion that has a non-empty ``class``
    attribute is matched, in this order of priority, against the markers
    ``e-example`` (text, plus ruby for the Japanese templates),
    ``e-transliteration`` (roman), ``e-translation`` (translation and the
    deprecated alias ``english``), ``e-literally`` (literal_meaning) and
    ``qualifier-content`` (appended to ``raw_tags``).

    ``example_data`` is modified in place, cleaned with
    ``clean_example_empty_data`` and returned.
    """
    # (class marker, example field, bold-offsets field), in match order.
    simple_fields = (
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )
    is_japanese = t_node.template_name in ("ja-usex", "ja-x", "ja-ux")

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    clean_node(wxr, sense_data, expanded)

    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        if len(class_names) == 0:
            continue

        if "e-example" in class_names:
            if is_japanese:
                # extract ruby in Japanese template
                ruby_data, node_without_ruby = extract_ruby(wxr, html_node)
                example_data["text"] = clean_node(wxr, None, node_without_ruby)
                offsets_source = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(node_without_ruby)
                )
                calculate_bold_offsets(
                    wxr,
                    offsets_source,
                    example_data["text"],
                    example_data,
                    "bold_text_offsets",
                )
                example_data["ruby"] = ruby_data
            else:
                example_data["text"] = clean_node(wxr, None, html_node)
                calculate_bold_offsets(
                    wxr,
                    html_node,
                    example_data["text"],
                    example_data,
                    "bold_text_offsets",
                )
            continue

        for marker, field, offsets_field in simple_fields:
            if marker not in class_names:
                continue
            example_data[field] = clean_node(wxr, None, html_node)
            if field == "translation":
                # DEPRECATED for "translation"
                example_data["english"] = example_data["translation"]
            calculate_bold_offsets(
                wxr,
                html_node,
                example_data[field],
                example_data,
                offsets_field,
            )
            break
        else:
            # None of the simple markers matched.
            if "qualifier-content" in class_names:
                raw_tag = clean_node(wxr, None, html_node)
                if raw_tag != "":
                    example_data["raw_tags"].append(raw_tag)

    clean_example_empty_data(example_data)
    return example_data
def add_zh_hant_hans_spans(
    wxr, example_data, example_text, span_tag, span_lang
):
    """Build one example for a Traditional/Simplified Chinese text span.

    A deep copy of ``example_data`` receives ``example_text`` as its text,
    the bold offsets computed from ``span_tag`` and a script tag:
    "Traditional-Chinese" when ``span_lang`` is "zh-Hant", otherwise
    "Simplified-Chinese".  The copy is cleaned with
    ``clean_example_empty_data`` and returned; ``example_data`` itself is
    left untouched.
    """
    if span_lang == "zh-Hant":
        script_tag = "Traditional-Chinese"
    else:
        script_tag = "Simplified-Chinese"

    result = deepcopy(example_data)
    result["text"] = example_text
    calculate_bold_offsets(
        wxr, span_tag, example_text, result, "bold_text_offsets"
    )
    result["tags"].append(script_tag)
    clean_example_empty_data(result)
    return result