Coverage for src / wiktextract / extractor / vi / example.py: 25%
172 statements
« prev ^ index » next — coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets
7from .linkage import (
8 GLOSS_LIST_LINKAGE_TEMPLATES,
9 extract_gloss_list_linkage_template,
10)
11from .models import Example, Sense, WordEntry
12from .tags import translate_raw_tags
def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    ref: str = "",
):
    """Walk one example list item and attach any parsed examples to *sense*.

    Handles three shapes: a bold-in-italic plain-text example followed by a
    dash-separated translation, the various example/quote templates, and
    nested child lists (recursed into with the current *ref*).
    """
    ux_templates = {
        "ux",
        "usex",
        "ux2",
        "uxi",
        "collocation",
        "th-usex",
        "th-x",
        "tha-x",
        "tha-usex",
        "uxa",
    }
    ja_templates = {"ja-usex", "ja-x", "jpn-usex"}
    zh_templates = {"zho-x", "zh-x", "zh-usex", "zho-usex", "zhex"}

    for idx, child in enumerate(list_item.children):
        is_wiki_node = isinstance(child, WikiNode)
        if (
            is_wiki_node
            and child.kind == NodeKind.ITALIC
            and child.contain_node(NodeKind.BOLD)
        ):
            # Plain italic example; everything after it in the same item is
            # its translation, separated by a dash.
            text = clean_node(wxr, None, child)
            if text != "":
                example = Example(text=text)
                calculate_bold_offsets(
                    wxr, child, text, example, "bold_text_offsets"
                )
                example.translation = clean_node(
                    wxr, None, list_item.children[idx + 1 :]
                ).strip("—- \n")
                sense.examples.append(example)
                break
        elif isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name in ux_templates:
                extract_ux_template(wxr, sense, child)
            elif t_name.startswith(("quote-", "RQ:")):
                # A quote template may be reference-only; keep its ref for
                # example templates that follow in the same item.
                ref = extract_quote_template(wxr, sense, child)
            elif t_name in GLOSS_LIST_LINKAGE_TEMPLATES:
                last_gloss = (
                    " ".join(word_entry.senses[-1].glosses)
                    if len(word_entry.senses) > 0
                    else ""
                )
                extract_gloss_list_linkage_template(
                    wxr,
                    word_entry,
                    child,
                    GLOSS_LIST_LINKAGE_TEMPLATES[t_name],
                    last_gloss,
                )
            elif t_name in ja_templates:
                extract_ja_x_template(wxr, child, sense, ref)
            elif t_name in zh_templates:
                extract_zh_x_template(wxr, child, sense, ref)
        elif is_wiki_node and child.kind == NodeKind.LIST:
            # Nested list: recurse into each child item, carrying the ref.
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, nested_item, ref
                )
def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Parse a usage-example ("ux"-family) template into an Example.

    The template is expanded to HTML; <i> tags carry the example text and
    transliteration, <span> tags carry translation, literal meaning and
    qualifier raw tags. The example is kept only if it has text.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="")
    for i_tag in expanded.find_html_recursively("i"):
        css_class = i_tag.attrs.get("class", "")
        if "e-example" in css_class:
            example.text = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.text, example, "bold_text_offsets"
            )
        elif "e-transliteration" in css_class:
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )
    for span_tag in expanded.find_html_recursively("span"):
        css_class = span_tag.attrs.get("class", "")
        if "e-translation" in css_class:
            example.translation = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example.translation,
                example,
                "bold_translation_offsets",
            )
        elif "e-literally" in css_class:
            example.literal_meaning = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example.literal_meaning,
                example,
                "bold_literal_offsets",
            )
        elif "qualifier-content" in css_class:
            qualifier = clean_node(wxr, None, span_tag)
            if qualifier != "":
                example.raw_tags.append(qualifier)
    example.ref = clean_node(
        wxr, None, t_node.template_parameters.get("ref", "")
    )
    if example.text != "":
        translate_raw_tags(example)
        sense.examples.append(example)
    # Collect category links produced by the template expansion.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)
def extract_quote_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> str:
    """Parse a "quote-*" / "RQ:*" template.

    If the template carries no quoted text (no ``text``/``passage``/7th
    positional argument) it is reference-only: its cleaned text is returned
    as the ref so later example templates in the same list item can reuse
    it. Otherwise an Example is built from the expanded HTML, appended to
    *sense* when it has text, and an empty ref is returned.
    """
    ref = ""
    if all(
        arg not in t_node.template_parameters for arg in ["text", "passage", 7]
    ):
        # Reference-only template; clean_node also collects categories.
        ref = clean_node(wxr, sense, t_node)
    else:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        example = Example(text="")
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if "cited-source" == span_class:
                example.ref = clean_node(wxr, None, span_tag)
            elif "e-quotation" in span_class:
                example.ruby, node_without_ruby = extract_ruby(wxr, span_tag)
                example.text = clean_node(wxr, None, node_without_ruby)
                calculate_bold_offsets(
                    wxr, span_tag, example.text, example, "bold_text_offsets"
                )
            elif "e-translation" in span_class:
                example.translation = clean_node(wxr, None, span_tag)
                # Fix: was "bold_translation_text"; every other extractor in
                # this file writes offsets to "bold_translation_offsets".
                calculate_bold_offsets(
                    wxr,
                    span_tag,
                    example.translation,
                    example,
                    "bold_translation_offsets",
                )
        for i_tag in expanded_node.find_html_recursively(
            "i", attr_name="class", attr_value="e-transliteration"
        ):
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )
            break  # only the first transliteration is used
        if example.text != "":
            sense.examples.append(example)
        # Collect category links from the expansion.
        clean_node(wxr, sense, expanded_node)
    return ref
def extract_ja_x_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense, ref: str
) -> None:
    """Parse a Japanese usage-example template ("ja-usex"/"ja-x"/"jpn-usex").

    Japanese text with ruby lives in "Jpan" spans, romanization in "tr"
    spans; the third positional argument is the translation and "lit" the
    literal meaning. *ref* is attached to the resulting Example.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="", ref=ref)
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        example.ruby, stripped = extract_ruby(wxr, jpan_span)
        example.text = clean_node(wxr, None, stripped)
        # Re-parse the ruby-stripped node so bold offsets match the text.
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(stripped)),
            example.text,
            example,
            "bold_text_offsets",
        )
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr, tr_span, example.roman, example, "bold_roman_offsets"
        )
    translation_arg = t_node.template_parameters.get(3, "")
    example.translation = clean_node(wxr, None, translation_arg)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
        example.translation,
        example,
        "bold_translation_offsets",
    )
    literal_arg = t_node.template_parameters.get("lit", "")
    example.literal_meaning = clean_node(wxr, None, literal_arg)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(literal_arg)),
        example.literal_meaning,
        example,
        "bold_literal_offsets",
    )
    if example.text != "":
        sense.examples.append(example)
    # Collect category links produced by the expansion.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)
def extract_zh_x_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense, ref: str
):
    """Parse a Chinese usage-example template ("zh-x" family).

    Tries the <dl>/<dd> layout first, then the inline layout. The second
    positional argument is the translation shared by all extracted examples.
    NOTE(review): *ref* is currently unused here.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    examples: list[Example] = []
    for dl_tag in expanded.find_html("dl"):
        examples.extend(extract_zh_x_dl_tag(wxr, dl_tag))
    if len(examples) == 0:
        examples.extend(extract_zh_x_no_dl_tag(wxr, expanded))

    translation_arg = t_node.template_parameters.get(2, "")
    translation = clean_node(wxr, None, translation_arg)
    for example in examples:
        example.translation = translation
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
            translation,
            example,
            "bold_translation_offsets",
        )
        translate_raw_tags(example)

    # Collect category links produced by the expansion.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)
    sense.examples.extend(examples)
def extract_zh_x_dl_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
    """Extract examples from the <dl> layout of an expanded zh-x template.

    Direct <span> children with a "lang" attribute hold example text; the
    rest are bracketed comma-separated raw tags for the preceding example.
    <dd> children carry romanization ("Latn" spans), extra raw tags, and the
    citation in <small> tags, all applied to every example collected so far.
    """
    examples: list[Example] = []
    for span_tag in dl_tag.find_html("span"):
        if "lang" in span_tag.attrs:
            text = clean_node(wxr, None, span_tag)
            if text != "":
                example = Example(text=text)
                calculate_bold_offsets(
                    wxr, span_tag, text, example, "bold_text_offsets"
                )
                examples.append(example)
        else:
            tag_list = clean_node(wxr, None, span_tag).strip("[] ")
            for raw_tag in tag_list.split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "" and len(examples) > 0:
                    examples[-1].raw_tags.append(raw_tag)
    ref = ""
    for dd_tag in dl_tag.find_html("dd"):
        for span_tag in dd_tag.find_html("span"):
            if "Latn" in span_tag.attrs.get("lang", ""):
                roman = clean_node(wxr, None, span_tag)
                for example in examples:
                    example.roman = roman
                    calculate_bold_offsets(
                        wxr, span_tag, roman, example, "bold_roman_offsets"
                    )
            else:
                raw_tag = clean_node(wxr, None, span_tag).strip("[] ")
                if raw_tag != "":
                    for example in examples:
                        example.raw_tags.append(raw_tag)
        for small_tag in dd_tag.find_html("small"):
            # "Từ:" ("From:") prefix is stripped from the citation text.
            ref = clean_node(wxr, None, small_tag).removeprefix("Từ:").strip()
            for example in examples:
                example.ref = ref
    return examples
def extract_zh_x_no_dl_tag(
    wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
    """Extract examples from a zh-x expansion that lacks a <dl> wrapper.

    "zh-Hant"/"zh-Hans" spans each start a new example (tagged with the
    script); a "zh-Latn" span applies its romanization to every example
    collected so far.
    """
    examples: list[Example] = []
    for span_tag in expanded_node.find_html("span"):
        lang = span_tag.attrs.get("lang", "")
        if lang == "zh-Latn":
            roman = clean_node(wxr, None, span_tag)
            for example in examples:
                example.roman = roman
                calculate_bold_offsets(
                    wxr, span_tag, roman, example, "bold_roman_offsets"
                )
        elif lang in ("zh-Hant", "zh-Hans"):
            text = clean_node(wxr, None, span_tag)
            example = Example(text=text)
            example.tags.append(
                "Traditional-Chinese"
                if lang == "zh-Hant"
                else "Simplified-Chinese"
            )
            if example.text != "":
                calculate_bold_offsets(
                    wxr, span_tag, text, example, "bold_text_offsets"
                )
                examples.append(example)
    return examples