Coverage report for src/wiktextract/extractor/en/example.py: 60% of 182 statements covered
(coverage.py v7.9.2, generated 2025-07-11 10:26 +0000).
1from copy import deepcopy
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...tags import valid_tags
7from ...wxr_context import WiktextractContext
8from ..ruby import extract_ruby
9from ..share import calculate_bold_offsets
10from .type_utils import ExampleData, SenseData
def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Extract example/quotation data from the templates of one list item.

    Dispatches on the template name (Chinese, Japanese, quote-*, ux-family)
    and recurses into nested list items for quote templates whose
    translation lines are given as child lists.
    """
    zh_names = ("zh-x", "zh-usex", "zh-q")
    ja_names = ("ja-usex", "ja-x", "ja-ux")
    ux_names = (
        "ux",
        "usex",
        "uxi",
        "ko-usex",
        "koex",
        "ko-x",
        "th-usex",
        "th-x",
        "th-xi",
        "uxa",
        "collocation",
        "co",
        "coi",
    )
    collected: list[ExampleData] = []
    for t_node in list_item.find_child(NodeKind.TEMPLATE):
        t_name = t_node.template_name
        if t_name in zh_names:
            collected.extend(
                extract_template_zh_x(wxr, t_node, sense_data, parent_data)
            )
        elif t_name in ja_names:
            collected.append(
                extract_template_ja_usex(wxr, t_node, sense_data, parent_data)
            )
        elif t_name == "quote" or t_name.startswith(("quote-", "RQ:")):
            q_example = extract_quote_templates(wxr, t_node, sense_data)
            if not list_item.contain_node(NodeKind.LIST):
                collected.append(q_example)
            else:
                # Translation/transliteration lines appear as nested list
                # items; recurse with the quote data as the parent.
                for child_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for key in ("tags", "raw_tags"):
                        if key not in q_example:
                            q_example[key] = []
                    collected.extend(
                        extract_example_list_item(
                            wxr, child_item, sense_data, q_example
                        )
                    )
        elif t_name in ux_names:
            child_data = deepcopy(parent_data)
            if t_name in ("collocation", "co", "coi"):
                child_data["tags"].append("collocation")
            collected.append(
                extract_ux_template(wxr, t_node, sense_data, child_data)
            )
    return collected
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Extract one quotation from a "quote-*", "RQ:*" or "quote" template.

    Expands the template and reads the cited source, quotation text,
    translation and transliteration from the rendered HTML spans.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Let clean_node collect any categories emitted by the template.
    clean_node(wxr, sense_data, expanded_node)
    example_data = ExampleData(
        text="", ref="", english="", roman="", type="quote"
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data["ref"] = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            example_data["text"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["text"],
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data["english"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["english"],
                example_data,
                "bold_english_offsets",
            )
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data["roman"] = clean_node(wxr, None, i_tag)
        # BUG FIX: previously passed "span_tag" (a leftover loop variable
        # from the span loop above, possibly unbound) instead of the <i>
        # tag actually being processed.
        calculate_bold_offsets(
            wxr,
            i_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
        break  # only the first transliteration is used
    clean_example_empty_data(example_data)
    return example_data
def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Parse a Japanese usage example template into example_data.

    https://en.wiktionary.org/wiki/Template:ja-usex
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    clean_node(wxr, sense_data, expanded)
    # Japanese example text: strip ruby annotations and keep them separately.
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, no_ruby = extract_ruby(wxr, jpan_span)
        example_data["text"] = clean_node(wxr, None, no_ruby)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby)),
            example_data["text"],
            example_data,
            "bold_text_offsets",
        )
        example_data["ruby"] = ruby_data
    # Romanization span(s).
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data["roman"] = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
    # Translation (3rd positional arg) and literal meaning ("lit" arg)
    # are taken from the raw template parameters, not the rendered HTML.
    for param, field, offsets_field in (
        (3, "english", "bold_english_offsets"),
        ("lit", "literal_meaning", "bold_literal_offsets"),
    ):
        arg_root = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(node.template_parameters.get(param, "")),
            expand_all=True,
        )
        example_data[field] = clean_node(wxr, None, arg_root)
        calculate_bold_offsets(
            wxr, arg_root, example_data[field], example_data, offsets_field
        )
    clean_example_empty_data(example_data)
    return example_data
def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
    """Extract examples from Chinese templates ({{zh-x}}, {{zh-usex}}, {{zh-q}}).

    https://en.wiktionary.org/wiki/Template:zh-x

    Returns one ExampleData per script variant (traditional/simplified).
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    # Let clean_node collect any categories emitted by the template.
    clean_node(wxr, sense_data, expanded_node)
    has_dl_tag = False
    results = []
    example_data = deepcopy(parent_example)
    # Second positional argument is the English translation.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data["english"] = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["english"],
        example_data,
        "bold_english_offsets",
    )
    # "lit" argument is the literal meaning.
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        # BUG FIX: was "tr_arg" — literal-meaning offsets were computed
        # from the translation node (cf. extract_template_ja_usex).
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    # Multi-line examples with a source line are rendered inside <dl>.
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            elif not dd_text.startswith("(literally,"):
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data["roman"],
                        example_data,
                        "bold_roman_offsets",
                    )
                    # Bracketed spans like "[Pinyin]" are script/dialect
                    # labels, kept as raw tags.
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data["raw_tags"].append(
                                span_text.strip("[]")
                            )
                    break  # only the first romanization span is used
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["roman"],
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        # One result per script variant span (zh-Hant / zh-Hans).
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = deepcopy(example_data)
                    new_example["text"] = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example["tags"].append(
                        "Traditional Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified Chinese"
                    )
                    clean_example_empty_data(new_example)
                    results.append(new_example)
    return results
def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    """Process the spans inside a {{zh-x}} <dl> block.

    Creates one ExampleData per example-text span (zh-Hant/zh-Hans) and
    attaches dialect labels to the most recently created example.
    Recurses into "vsHide" spans (used when the template is collapsed).
    """
    # process example text span tag and dialect span tag
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            # Each script variant becomes its own example entry.
            new_example = deepcopy(example)
            new_example["text"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example["text"],
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # The first hidden block belongs to the last example already
            # created (if any); later ones start from the shared parent.
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # Small-font link spans hold dialect labels; attach them to the
            # latest example, or to the parent if none was created yet.
            for link_node in span_tag.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1]["raw_tags"].append(raw_tag)
                    else:
                        example["raw_tags"].append(raw_tag)

    # Only normalize once, at the top of the recursion (the outer <dl>).
    if dl_tag.tag == "dl":
        for data in results:
            clean_example_empty_data(data)
    return results
# Raw-tag labels emitted by {{zh-x}} mapped to canonical tag names.
ZH_X_TAGS = {
    "trad.": "Traditional Chinese",
    "simp.": "Simplified Chinese",
}
362def clean_example_empty_data(data: ExampleData) -> None:
363 # remove empty data and convert raw tags
364 raw_tags = data.get("raw_tags", [])
365 new_raw_tags = []
366 for raw_tag in raw_tags:
367 if raw_tag in ZH_X_TAGS: 367 ↛ 368line 367 didn't jump to line 368 because the condition on line 367 was never true
368 data["tags"].append(ZH_X_TAGS[raw_tag])
369 elif raw_tag in valid_tags: 369 ↛ 370line 369 didn't jump to line 370 because the condition on line 369 was never true
370 data["tags"].append(raw_tag)
371 else:
372 new_raw_tags.append(raw_tag)
373 data["raw_tags"] = new_raw_tags
374 if len(data.get("ref", "")) > 0: 374 ↛ 375line 374 didn't jump to line 375 because the condition on line 374 was never true
375 data["type"] = "quote"
376 else:
377 data["type"] = "example"
378 for key, value in data.copy().items():
379 if len(value) == 0:
380 del data[key]
def extract_ux_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Parse a {{ux}}-family usage-example template into example_data.

    Fills text, transliteration, translation, literal meaning and
    qualifier raw tags based on the CSS classes of the rendered HTML.
    """
    # Maps a CSS class marker to (example field, bold-offsets field).
    field_by_class = {
        "e-example": ("text", "bold_text_offsets"),
        "e-transliteration": ("roman", "bold_roman_offsets"),
        "e-translation": ("english", "bold_english_offsets"),
        "e-literally": ("literal_meaning", "bold_literal_offsets"),
    }
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        classes = html_node.attrs.get("class", "")
        matched = False
        for marker, (field, offsets_field) in field_by_class.items():
            if marker in classes:
                example_data[field] = clean_node(wxr, None, html_node)
                calculate_bold_offsets(
                    wxr,
                    html_node,
                    example_data[field],
                    example_data,
                    offsets_field,
                )
                matched = True
                break
        if not matched and "qualifier-content" in classes:
            qualifier = clean_node(wxr, None, html_node)
            if qualifier != "":
                example_data["raw_tags"].append(qualifier)

    clean_example_empty_data(example_data)
    return example_data