Coverage for src / wiktextract / extractor / en / example.py: 59%
189 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:38 +0000
1from copy import deepcopy
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...tags import valid_tags
7from ...wxr_context import WiktextractContext
8from ..ruby import extract_ruby
9from ..share import calculate_bold_offsets
10from .type_utils import ExampleData, SenseData
def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Extract example/quotation data from one wikitext list item.

    Dispatches on the template name found in the list item:
    - ``zh-x`` family → ``extract_template_zh_x`` (may yield several examples,
      one per script variant)
    - ``ja-usex`` family → ``extract_template_ja_usex``
    - ``quote-*``/``RQ:*``/``quote`` → ``extract_quote_templates``; when the
      list item has a nested list, its children are processed recursively with
      the quote data used as parent context
    - ``ux``/``co`` family → ``extract_ux_template`` on a deep copy of
      ``parent_data`` so sibling templates don't share mutable state

    Returns a list of extracted ``ExampleData`` dicts (possibly empty).
    """
    examples = []
    for template_node in list_item.find_child(NodeKind.TEMPLATE):
        if template_node.template_name in ("zh-x", "zh-usex", "zh-q", "zh-co"):
            examples.extend(
                extract_template_zh_x(
                    wxr,
                    template_node,
                    sense_data,
                    parent_data,
                )
            )
        elif template_node.template_name in ("ja-usex", "ja-x", "ja-ux"):
            examples.append(
                extract_template_ja_usex(
                    wxr,
                    template_node,
                    sense_data,
                    parent_data,
                )
            )
        elif (
            template_node.template_name.startswith(("quote-", "RQ:"))
            or template_node.template_name == "quote"
        ):
            q_example = extract_quote_templates(wxr, template_node, sense_data)
            if list_item.contain_node(NodeKind.LIST):
                # The quote's child list items hold the example text; pass the
                # quote data down as parent context. Ensure the tag fields
                # exist so children can append to them.
                for next_list_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for key in ("tags", "raw_tags"):
                        if key not in q_example:
                            q_example[key] = []
                    examples.extend(
                        extract_example_list_item(
                            wxr, next_list_item, sense_data, q_example
                        )
                    )
            else:
                examples.append(q_example)
        # BUG FIX: the original list contained "uxa" twice; use a set with
        # each name listed once.
        elif template_node.template_name in {
            "ux",
            "usex",
            "uxi",
            "uxa",
            "ko-usex",
            "koex",
            "ko-x",
            "th-usex",
            "th-x",
            "th-xi",
            "collocation",
            "co",
            "coi",
        }:
            copy_of_parent_data = deepcopy(parent_data)
            if template_node.template_name in ("collocation", "co", "coi"):
                copy_of_parent_data["tags"].append("collocation")
            examples.append(
                extract_ux_template(
                    wxr,
                    template_node,
                    sense_data,
                    copy_of_parent_data,
                )
            )

    return examples
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Extract one ``quote-*``/``RQ:*``/``quote`` template expansion.

    Expands the template, collects categories into ``sense_data``, then pulls
    the citation source, quoted text (with ruby stripped), translation and
    transliteration out of the expanded HTML by CSS class.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Collect category links from the expansion into the sense.
    clean_node(wxr, sense_data, expanded_node)
    example_data = ExampleData(
        text="", ref="", english="", roman="", type="quote"
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data["ref"] = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
            if len(ruby_data) > 0:
                example_data["ruby"] = ruby_data
            example_data["text"] = clean_node(wxr, None, node_without_ruby)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["text"],
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data["translation"] = clean_node(
                wxr, None, span_tag
            )  # DEPRECATED for "translation"
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["translation"],
                example_data,
                "bold_translation_offsets",
            )
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data["roman"] = clean_node(wxr, None, i_tag)
        # BUG FIX: this call previously passed ``span_tag`` — a stale loop
        # variable from the span loop above (unbound NameError if that loop
        # never ran). The bold offsets must be computed from the <i> node
        # that actually contains the transliteration.
        calculate_bold_offsets(
            wxr,
            i_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
        break
    clean_example_empty_data(example_data)
    return example_data
def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Extract a Japanese usage-example template into ``example_data``.

    Mutates and returns the ``example_data`` passed in (no copy is made).
    The Japanese text, ruby annotations, transliteration, translation
    (template arg 3) and literal meaning (``lit=`` arg) are filled in, then
    empty fields are removed by ``clean_example_empty_data``.
    """
    # https://en.wiktionary.org/wiki/Template:ja-usex
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Collect category links from the expansion into the sense.
    clean_node(wxr, sense_data, expanded_node)
    # Japanese text is in a <span class="Jpan">; ruby markup is stripped out
    # of the text and stored separately under "ruby".
    for span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
        example_data["text"] = clean_node(wxr, None, node_without_ruby)
        # Re-parse the ruby-free node so bold offsets match the stored text.
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(node_without_ruby)),
            example_data["text"],
            example_data,
            "bold_text_offsets",
        )
        example_data["ruby"] = ruby_data
    # Transliteration (romaji) is in a <span class="tr">.
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data["roman"] = clean_node(wxr, None, span_tag)
        calculate_bold_offsets(
            wxr,
            span_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
    # English translation is the third positional template argument.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, tr_arg)
    example_data["english"] = example_data[
        "translation"
    ]  # DEPRECATED for "translation"
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    # Literal meaning comes from the named "lit" argument.
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    clean_example_empty_data(example_data)
    return example_data
def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
    """Extract a Chinese usage-example template ({{zh-x}} family).

    Works on a deep copy of ``parent_example``. Translation (positional arg
    2) and literal meaning (``lit=`` arg) apply to all variants; the expanded
    HTML is then searched either for <dl>/<dd> structures (sourced, multi-line
    quotes) or for flat spans (single-line examples). One ``ExampleData`` is
    produced per script variant (Traditional/Simplified).
    """
    # https://en.wiktionary.org/wiki/Template:zh-x
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    # Collect category links from the expansion into the sense.
    clean_node(wxr, sense_data, expanded_node)
    has_dl_tag = False
    results = []
    example_data = deepcopy(parent_example)
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, tr_arg)
    example_data["english"] = example_data[
        "translation"
    ]  # DEPRECATED for "translation"
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    # BUG FIX: the literal-meaning offsets were previously computed against
    # ``tr_arg`` (the translation parse); they must use ``lit_arg``, matching
    # the parallel code in extract_template_ja_usex().
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            elif not dd_text.startswith("(literally,"):
                # The romanization line: take the Latin-script span, then
                # scan its sibling spans for bracketed dialect/style labels.
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data["roman"],
                        example_data,
                        "bold_roman_offsets",
                    )
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data["raw_tags"].append(
                                span_text.strip("[]")
                            )
                    break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))
    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["roman"],
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    # One example per script variant, tagged accordingly.
                    new_example = deepcopy(example_data)
                    new_example["text"] = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example["tags"].append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    clean_example_empty_data(new_example)
                    results.append(new_example)
    return results
def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    """Extract example-text and dialect spans from a {{zh-x}} <dl>/<dd> block.

    Returns one ``ExampleData`` (copied from ``example``) per Chinese-script
    span found. Called recursively on "vsHide" wrapper spans (produced when
    the template has ``collapsed=y``); in that case the first hidden span is
    treated as a continuation of the most recent result. Empty fields are
    cleaned only at the outermost (<dl>) level, after recursion finishes.
    """
    # process example text span tag and dialect span tag
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            # A Chinese-script span: start a new example from the shared data.
            new_example = deepcopy(example)
            new_example["text"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example["text"],
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # The first hidden block extends the last result (if any);
            # subsequent ones start from the base example again.
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # Small-print span holds dialect/register labels as wiki links;
            # attach them to the most recent example, or to the shared base
            # data if no example has been produced yet.
            for link_node in span_tag.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1]["raw_tags"].append(raw_tag)
                    else:
                        example["raw_tags"].append(raw_tag)
    # Only clean at the top of the recursion (the actual <dl> element).
    if dl_tag.tag == "dl":
        for data in results:
            clean_example_empty_data(data)
    return results
# Maps raw label strings emitted by the {{zh-x}} template family to canonical
# tag names; used by clean_example_empty_data() to convert raw_tags to tags.
ZH_X_TAGS = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
    "Taiwanese Mandarin": "Taiwanese-Mandarin",
    "MSC": "Standard-Chinese",
    "Literary Chinese": "Literary-Chinese",
    "Classical Chinese": "Classical-Chinese",
    "Guangzhou Cantonese": "Guangzhou-Cantonese",
}
def clean_example_empty_data(data: ExampleData) -> None:
    """Normalize an example in place: promote known raw tags to ``tags``,
    set ``type`` ("quote" when a reference is present, else "example"),
    and drop every field whose value is empty.
    """
    # remove empty data and convert raw tags
    leftover_raw_tags = []
    for raw_tag in data.get("raw_tags", []):
        canonical = ZH_X_TAGS.get(raw_tag)
        if canonical is not None:
            data["tags"].append(canonical)
        elif raw_tag in valid_tags:
            data["tags"].append(raw_tag)
        else:
            leftover_raw_tags.append(raw_tag)
    data["raw_tags"] = leftover_raw_tags
    data["type"] = "quote" if len(data.get("ref", "")) > 0 else "example"
    # Delete empty fields; collect keys first so we don't mutate while
    # iterating.
    for empty_key in [key for key, value in data.items() if len(value) == 0]:
        del data[empty_key]
def extract_ux_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Extract a usage-example template ({{ux}}, {{co}}, etc.).

    Expands the template, collects categories into ``sense_data``, then fills
    ``example_data`` in place from the expanded HTML nodes, matching on CSS
    class names. Returns the (mutated) ``example_data``.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)

    def _fill(field: str, offsets_field: str, source_node) -> None:
        # Store the cleaned text of source_node and record its bold spans.
        example_data[field] = clean_node(wxr, None, source_node)
        calculate_bold_offsets(
            wxr,
            source_node,
            example_data[field],
            example_data,
            offsets_field,
        )

    for tag_node in expanded.find_child_recursively(NodeKind.HTML):
        classes = tag_node.attrs.get("class", "")
        if "e-example" in classes:
            _fill("text", "bold_text_offsets", tag_node)
        elif "e-transliteration" in classes:
            _fill("roman", "bold_roman_offsets", tag_node)
        elif "e-translation" in classes:
            _fill("translation", "bold_translation_offsets", tag_node)
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
        elif "e-literally" in classes:
            _fill("literal_meaning", "bold_literal_offsets", tag_node)
        elif "qualifier-content" in classes:
            qualifier = clean_node(wxr, None, tag_node)
            if qualifier != "":
                example_data["raw_tags"].append(qualifier)

    clean_example_empty_data(example_data)
    return example_data
458 return example_data