Coverage for src/wiktextract/extractor/en/example.py: 59%
189 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from copy import deepcopy
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...tags import valid_tags
7from ...wxr_context import WiktextractContext
8from ..ruby import extract_ruby
9from ..share import calculate_bold_offsets
10from .type_utils import ExampleData, SenseData
def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Dispatch each example template in a list item to its extractor.

    Returns the list of extracted examples; *parent_data* supplies
    inherited fields (tags etc.) that handlers copy into each example.
    """
    examples = []
    for t_node in list_item.find_child(NodeKind.TEMPLATE):
        t_name = t_node.template_name
        if t_name in ("zh-x", "zh-usex", "zh-q", "zh-co"):
            examples.extend(
                extract_template_zh_x(wxr, t_node, sense_data, parent_data)
            )
        elif t_name in ("ja-usex", "ja-x", "ja-ux"):
            examples.append(
                extract_template_ja_usex(wxr, t_node, sense_data, parent_data)
            )
        elif t_name.startswith(("quote-", "RQ:")) or t_name == "quote":
            q_example = extract_quote_templates(wxr, t_node, sense_data)
            if not list_item.contain_node(NodeKind.LIST):
                examples.append(q_example)
            else:
                # The quote acts as shared parent data for nested list items.
                for child_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for key in ("tags", "raw_tags"):
                        q_example.setdefault(key, [])
                    examples.extend(
                        extract_example_list_item(
                            wxr, child_item, sense_data, q_example
                        )
                    )
        elif t_name in {
            "ux",
            "usex",
            "uxi",
            "ko-usex",
            "koex",
            "ko-x",
            "th-usex",
            "th-x",
            "th-xi",
            "uxa",
            "collocation",
            "co",
            "coi",
        }:
            base_data = deepcopy(parent_data)
            if t_name in {"collocation", "co", "coi"}:
                base_data["tags"].append("collocation")
            examples.append(
                extract_ux_template(wxr, t_node, sense_data, base_data)
            )
    return examples
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Extract a quotation example from "quote-*", "RQ:*" or "quote" templates.

    The template is expanded to HTML and the pieces are identified by the
    CSS classes Wiktionary assigns to the rendered span/i tags.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Let clean_node record any categories the template adds to the sense.
    clean_node(wxr, sense_data, expanded_node)
    example_data = ExampleData(
        text="", ref="", english="", roman="", type="quote"
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data["ref"] = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
            if len(ruby_data) > 0:
                example_data["ruby"] = ruby_data
            example_data["text"] = clean_node(wxr, None, node_without_ruby)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["text"],
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data["translation"] = clean_node(
                wxr, None, span_tag
            )  # DEPRECATED for "translation"
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["translation"],
                example_data,
                "bold_translation_offsets",
            )
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data["roman"] = clean_node(wxr, None, i_tag)
        # Fix: compute the offsets from the transliteration node itself.
        # The old code passed "span_tag", a leftover from the previous
        # loop (and a NameError if that loop never ran).
        calculate_bold_offsets(
            wxr,
            i_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
        break
    clean_example_empty_data(example_data)
    return example_data
def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    # https://en.wiktionary.org/wiki/Template:ja-usex
    """Fill *example_data* from an expanded Japanese usage-example template."""
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    clean_node(wxr, sense_data, expanded)
    # Japanese text lives in a <span class="Jpan">; ruby is split out first.
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, plain_node = extract_ruby(wxr, jpan_span)
        example_data["text"] = clean_node(wxr, None, plain_node)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(plain_node)),
            example_data["text"],
            example_data,
            "bold_text_offsets",
        )
        example_data["ruby"] = ruby_data
    # Romanization is rendered inside a <span class="tr">.
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data["roman"] = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
    # The English translation is the third positional template argument.
    translation_tree = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, translation_tree)
    example_data["english"] = example_data[
        "translation"
    ]  # DEPRECATED for "translation"
    calculate_bold_offsets(
        wxr,
        translation_tree,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    # The literal meaning comes from the "lit" named argument.
    literal_tree = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, literal_tree)
    calculate_bold_offsets(
        wxr,
        literal_tree,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    clean_example_empty_data(example_data)
    return example_data
def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
    # https://en.wiktionary.org/wiki/Template:zh-x
    """Extract Chinese example templates ("zh-x", "zh-q", "zh-co", …).

    Returns one ExampleData per script variant (Traditional/Simplified).
    Sourced quotations render inside a <dl> tag; plain examples do not.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)
    has_dl_tag = False
    results = []
    example_data = deepcopy(parent_example)
    # Second positional argument is the English translation.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, tr_arg)
    example_data["english"] = example_data["translation"]
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    # Fix: the literal-meaning offsets were computed against "tr_arg"
    # (the translation argument) instead of "lit_arg".
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            elif not dd_text.startswith("(literally,"):
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data["roman"],
                        example_data,
                        "bold_roman_offsets",
                    )
                    # Bracketed spans like "[Pinyin]" become raw tags.
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data["raw_tags"].append(
                                span_text.strip("[]")
                            )
                    break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))
    # No source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["roman"],
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = deepcopy(example_data)
                    new_example["text"] = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example["tags"].append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    clean_example_empty_data(new_example)
                    results.append(new_example)
    return results
def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    """Process example-text span tags and dialect span tags of a zh-x <dl>."""
    results = []
    first_hidden_block = True
    for child_span in dl_tag.find_html("span"):
        lang_attr = child_span.attrs.get("lang", "")
        if lang_attr in ("zh-Hant", "zh-Hans"):
            variant = deepcopy(example)
            variant["text"] = clean_node(wxr, None, child_span)
            calculate_bold_offsets(
                wxr,
                child_span,
                variant["text"],
                variant,
                "bold_text_offsets",
            )
            results.append(variant)
        elif "vsHide" in child_span.attrs.get("class", ""):
            # Collapsed block produced by the template arg "collapsed=y";
            # the first hidden block extends the latest extracted example.
            base = (
                results[-1]
                if first_hidden_block and len(results) > 0
                else example
            )
            results.extend(extract_zh_x_dl_span_tag(wxr, child_span, base))
            first_hidden_block = False
        elif "font-size:x-small" in child_span.attrs.get("style", ""):
            # Small-font dialect links become raw tags.
            for link_node in child_span.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    target = results[-1] if len(results) > 0 else example
                    target["raw_tags"].append(raw_tag)
    # Only normalize at the top of the recursion (the actual <dl> tag).
    if dl_tag.tag == "dl":
        for data in results:
            clean_example_empty_data(data)
    return results
# Maps script labels emitted by the zh-x template to canonical tag names.
ZH_X_TAGS = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
}
def clean_example_empty_data(data: ExampleData) -> None:
    """Convert known raw tags to tags, set the example type, drop empty fields."""
    remaining_raw_tags = []
    for raw_tag in data.get("raw_tags", []):
        if raw_tag in ZH_X_TAGS:
            data["tags"].append(ZH_X_TAGS[raw_tag])
        elif raw_tag in valid_tags:
            data["tags"].append(raw_tag)
        else:
            remaining_raw_tags.append(raw_tag)
    data["raw_tags"] = remaining_raw_tags
    # A non-empty source reference marks the example as a quotation.
    data["type"] = "quote" if len(data.get("ref", "")) > 0 else "example"
    # Remove keys whose values are empty strings/lists.
    for key in [k for k, v in data.items() if len(v) == 0]:
        del data[key]
def extract_ux_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Fill *example_data* from a "ux"/"co"-family usage-example template."""
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    clean_node(wxr, sense_data, expanded)
    # Each piece of the rendered example is identified by its CSS class.
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        if "e-example" in class_names:
            text_value = clean_node(wxr, None, html_node)
            example_data["text"] = text_value
            calculate_bold_offsets(
                wxr, html_node, text_value, example_data, "bold_text_offsets"
            )
        elif "e-transliteration" in class_names:
            roman_value = clean_node(wxr, None, html_node)
            example_data["roman"] = roman_value
            calculate_bold_offsets(
                wxr, html_node, roman_value, example_data, "bold_roman_offsets"
            )
        elif "e-translation" in class_names:
            translation_value = clean_node(wxr, None, html_node)
            example_data["translation"] = translation_value
            # DEPRECATED for "translation"
            example_data["english"] = translation_value
            calculate_bold_offsets(
                wxr,
                html_node,
                translation_value,
                example_data,
                "bold_translation_offsets",
            )
        elif "e-literally" in class_names:
            literal_value = clean_node(wxr, None, html_node)
            example_data["literal_meaning"] = literal_value
            calculate_bold_offsets(
                wxr,
                html_node,
                literal_value,
                example_data,
                "bold_literal_offsets",
            )
        elif "qualifier-content" in class_names:
            raw_tag = clean_node(wxr, None, html_node)
            if raw_tag != "":
                example_data["raw_tags"].append(raw_tag)
    clean_example_empty_data(example_data)
    return example_data