Coverage for src/wiktextract/extractor/en/example.py: 59%
189 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1from copy import deepcopy
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...tags import valid_tags
7from ...wxr_context import WiktextractContext
8from ..ruby import extract_ruby
9from ..share import calculate_bold_offsets
10from .type_utils import ExampleData, SenseData
def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Collect example/quotation data from one list item node.

    Dispatches on each template found in the list item and delegates to
    the matching extractor. Quote templates followed by a nested list
    recurse into the child list items, passing the quote data down as
    the new parent so the children inherit its fields.
    """
    collected: list[ExampleData] = []
    for t_node in list_item.find_child(NodeKind.TEMPLATE):
        t_name = t_node.template_name
        if t_name in ("zh-x", "zh-usex", "zh-q", "zh-co"):
            collected.extend(
                extract_template_zh_x(wxr, t_node, sense_data, parent_data)
            )
        elif t_name in ("ja-usex", "ja-x", "ja-ux"):
            collected.append(
                extract_template_ja_usex(wxr, t_node, sense_data, parent_data)
            )
        elif t_name == "quote" or t_name.startswith(("quote-", "RQ:")):
            quote_data = extract_quote_templates(wxr, t_node, sense_data)
            if not list_item.contain_node(NodeKind.LIST):
                collected.append(quote_data)
            else:
                # Child list items hold the actual example text; the
                # quote data acts as their parent.
                for child_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for field in ("tags", "raw_tags"):
                        quote_data.setdefault(field, [])
                    collected.extend(
                        extract_example_list_item(
                            wxr, child_item, sense_data, quote_data
                        )
                    )
        elif t_name in (
            "ux",
            "usex",
            "uxi",
            "ko-usex",
            "koex",
            "ko-x",
            "th-usex",
            "th-x",
            "th-xi",
            "uxa",
            "collocation",
            "co",
            "coi",
        ):
            inherited = deepcopy(parent_data)
            if t_name in ("collocation", "co", "coi"):
                inherited["tags"].append("collocation")
            collected.append(
                extract_ux_template(wxr, t_node, sense_data, inherited)
            )
    return collected
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Extract a quotation example from a "quote-*"/"RQ:*"/"quote" template.

    The template is expanded and the resulting HTML is scanned:
    a "cited-source" span gives the reference, an "e-quotation" span the
    quoted text (ruby annotations split out), an "e-translation" span the
    translation, and an "e-transliteration" <i> tag the romanization.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Run clean_node over the whole expansion so category links are
    # recorded on sense_data.
    clean_node(wxr, sense_data, expanded_node)
    example_data = ExampleData(
        text="", ref="", english="", roman="", type="quote"
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data["ref"] = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
            if len(ruby_data) > 0:
                example_data["ruby"] = ruby_data
            example_data["text"] = clean_node(wxr, None, node_without_ruby)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["text"],
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data["translation"] = clean_node(
                wxr, None, span_tag
            )
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["translation"],
                example_data,
                "bold_translation_offsets",
            )
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data["roman"] = clean_node(wxr, None, i_tag)
        calculate_bold_offsets(
            wxr,
            # BUG FIX: offsets were computed from "span_tag", a stale
            # variable leaked from the loop above (unbound if that loop
            # never ran); the transliteration node itself must be used.
            i_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
        break  # only the first transliteration tag is used
    clean_example_empty_data(example_data)
    return example_data
def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Extract a Japanese usage example from the {{ja-usex}} template.

    https://en.wiktionary.org/wiki/Template:ja-usex

    Fills *example_data* in place with text (ruby split out),
    romanization, translation and literal meaning, then returns it.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)
    # Japanese text span: separate ruby annotations from the base text.
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, without_ruby = extract_ruby(wxr, jpan_span)
        example_data["text"] = clean_node(wxr, None, without_ruby)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(without_ruby)),
            example_data["text"],
            example_data,
            "bold_text_offsets",
        )
        example_data["ruby"] = ruby_data
    # Romanization span.
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data["roman"] = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
    # Third positional argument is the translation, "lit" the literal
    # meaning; both are parsed the same way.
    for arg_key, field, offsets_field in (
        (3, "translation", "bold_translation_offsets"),
        ("lit", "literal_meaning", "bold_literal_offsets"),
    ):
        parsed_arg = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(node.template_parameters.get(arg_key, "")),
            expand_all=True,
        )
        example_data[field] = clean_node(wxr, None, parsed_arg)
        if field == "translation":
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
        calculate_bold_offsets(
            wxr,
            parsed_arg,
            example_data[field],
            example_data,
            offsets_field,
        )
    clean_example_empty_data(example_data)
    return example_data
def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
    """Extract Chinese examples from {{zh-x}} and related templates.

    https://en.wiktionary.org/wiki/Template:zh-x

    Returns one ExampleData per script variant (traditional/simplified).
    Quotations with a source are rendered inside a <dl> tag and handled
    by extract_zh_x_dl_span_tag(); otherwise the single-line layout is
    parsed directly.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)  # collect category links
    has_dl_tag = False
    results = []
    example_data = deepcopy(parent_example)
    # Second positional argument is the translation.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, tr_arg)
    example_data["english"] = example_data["translation"]
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        # BUG FIX: was "tr_arg" — literal-meaning bold offsets were
        # computed against the translation argument instead of "lit".
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            elif not dd_text.startswith("(literally,"):
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data["roman"],
                        example_data,
                        "bold_roman_offsets",
                    )
                # Bracketed span text like "[MSC]" is a raw tag.
                for span_tag in dd_tag.find_html_recursively("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text.startswith("[") and span_text.endswith(
                        "]"
                    ):
                        example_data["raw_tags"].append(
                            span_text.strip("[]")
                        )
                        break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["roman"],
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        # One result per script variant span.
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = deepcopy(example_data)
                    new_example["text"] = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example["tags"].append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    clean_example_empty_data(new_example)
                    results.append(new_example)
    return results
def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    # process example text span tag and dialect span tag
    # Called first with the <dl> element; recurses into "vsHide" spans
    # produced when the template was invoked with "collapsed=y".
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            # Example text span: one ExampleData per script variant,
            # each starting from a copy of the shared parent data.
            new_example = deepcopy(example)
            new_example["text"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example["text"],
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # The first hidden block continues the most recent example
            # (if any); later hidden blocks restart from the parent data.
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # Small-font span holds dialect/variety links used as raw
            # tags; attach them to the latest example when one exists.
            for link_node in span_tag.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1]["raw_tags"].append(raw_tag)
                    else:
                        example["raw_tags"].append(raw_tag)

    # Finalize only at the top of the recursion (the <dl> itself), once
    # all nested spans have contributed their data.
    if dl_tag.tag == "dl":
        for data in results:
            clean_example_empty_data(data)
    return results
# Maps raw tag strings produced by the {{zh-x}} template family to the
# canonical tag names used in extracted data (see
# clean_example_empty_data below).
ZH_X_TAGS = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
    "Taiwanese Mandarin": "Taiwanese-Mandarin",
    "MSC": "Standard-Chinese",
    "Literary Chinese": "Literary-Chinese",
    "Classical Chinese": "Classical-Chinese",
    "Guangzhou Cantonese": "Guangzhou-Cantonese",
}
379def clean_example_empty_data(data: ExampleData) -> None:
380 # remove empty data and convert raw tags
381 raw_tags = data.get("raw_tags", [])
382 new_raw_tags = []
383 for raw_tag in raw_tags:
384 if raw_tag in ZH_X_TAGS: 384 ↛ 385line 384 didn't jump to line 385 because the condition on line 384 was never true
385 data["tags"].append(ZH_X_TAGS[raw_tag])
386 elif raw_tag in valid_tags: 386 ↛ 387line 386 didn't jump to line 387 because the condition on line 386 was never true
387 data["tags"].append(raw_tag)
388 else:
389 new_raw_tags.append(raw_tag)
390 data["raw_tags"] = new_raw_tags
391 if len(data.get("ref", "")) > 0: 391 ↛ 392line 391 didn't jump to line 392 because the condition on line 391 was never true
392 data["type"] = "quote"
393 else:
394 data["type"] = "example"
395 for key, value in data.copy().items():
396 if len(value) == 0:
397 del data[key]
def extract_ux_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Extract a usage example from a {{ux}}-family template.

    The expanded HTML is walked once; each recognized CSS class fills
    the matching field of *example_data* (recording bold offsets), and
    qualifier spans are collected as raw tags.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)
    # CSS class -> (target field, bold-offsets field); insertion order
    # mirrors the original elif chain.
    class_fields = {
        "e-example": ("text", "bold_text_offsets"),
        "e-transliteration": ("roman", "bold_roman_offsets"),
        "e-translation": ("translation", "bold_translation_offsets"),
        "e-literally": ("literal_meaning", "bold_literal_offsets"),
    }
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        matched = False
        for css_class, (field, offsets_field) in class_fields.items():
            if css_class in class_names:
                example_data[field] = clean_node(wxr, None, html_node)
                if field == "translation":
                    example_data["english"] = example_data[
                        "translation"
                    ]  # DEPRECATED for "translation"
                calculate_bold_offsets(
                    wxr,
                    html_node,
                    example_data[field],
                    example_data,
                    offsets_field,
                )
                matched = True
                break
        if not matched and "qualifier-content" in class_names:
            raw_tag = clean_node(wxr, None, html_node)
            if raw_tag != "":
                example_data["raw_tags"].append(raw_tag)
    clean_example_empty_data(example_data)
    return example_data