Coverage for src/wiktextract/extractor/vi/example.py: 25%
172 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets
7from .linkage import (
8 GLOSS_LIST_LINKAGE_TEMPLATES,
9 extract_gloss_list_linkage_template,
10)
11from .models import Example, Sense, WordEntry
12from .tags import translate_raw_tags
def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    ref: str = "",
):
    """Walk one example list item and collect example data into *sense*.

    Recognizes three shapes: an italic node containing bold text (a plain
    wikitext example followed by its translation), a known example/quote
    template, or a nested list of further example items (recursed into).
    A reference returned by a quote template is carried forward to later
    sibling templates via *ref*.
    """
    ux_names = (
        "ux",
        "usex",
        "ux2",
        "uxi",
        "collocation",
        "th-usex",
        "th-x",
        "tha-x",
        "tha-usex",
    )
    ja_names = ("ja-usex", "ja-x", "jpn-usex")
    zh_names = ("zho-x", "zh-x", "zh-usex", "zho-usex", "zhex")
    for index, child in enumerate(list_item.children):
        is_wiki_node = isinstance(child, WikiNode)
        if (
            is_wiki_node
            and child.kind == NodeKind.ITALIC
            and child.contain_node(NodeKind.BOLD)
        ):
            example_text = clean_node(wxr, None, child)
            if example_text != "":
                example = Example(text=example_text)
                calculate_bold_offsets(
                    wxr, child, example_text, example, "bold_text_offsets"
                )
                # Everything after the italic node is the translation,
                # usually separated by a dash.
                example.translation = clean_node(
                    wxr, None, list_item.children[index + 1 :]
                ).strip("—- \n")
                sense.examples.append(example)
                break
        elif isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name in ux_names:
                extract_ux_template(wxr, sense, child)
            elif t_name.startswith(("quote-", "RQ:")):
                ref = extract_quote_template(wxr, sense, child)
            elif t_name in GLOSS_LIST_LINKAGE_TEMPLATES:
                last_gloss = (
                    " ".join(word_entry.senses[-1].glosses)
                    if len(word_entry.senses) > 0
                    else ""
                )
                extract_gloss_list_linkage_template(
                    wxr,
                    word_entry,
                    child,
                    GLOSS_LIST_LINKAGE_TEMPLATES[t_name],
                    last_gloss,
                )
            elif t_name in ja_names:
                extract_ja_x_template(wxr, child, sense, ref)
            elif t_name in zh_names:
                extract_zh_x_template(wxr, child, sense, ref)
        elif is_wiki_node and child.kind == NodeKind.LIST:
            # Nested example list: recurse, forwarding any reference
            # collected so far.
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, nested_item, ref
                )
def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Parse an expanded {{ux}}-family template into one Example.

    The template is expanded to HTML; CSS classes on <i>/<span> tags mark
    the example text, transliteration, translation, literal meaning and
    qualifier tags.  The example is appended to *sense* only when example
    text was found.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="")
    for italic in expanded.find_html_recursively("i"):
        css_class = italic.attrs.get("class", "")
        if "e-example" in css_class:
            example.text = clean_node(wxr, None, italic)
            calculate_bold_offsets(
                wxr, italic, example.text, example, "bold_text_offsets"
            )
        elif "e-transliteration" in css_class:
            example.roman = clean_node(wxr, None, italic)
            calculate_bold_offsets(
                wxr, italic, example.roman, example, "bold_roman_offsets"
            )
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        if "e-translation" in css_class:
            example.translation = clean_node(wxr, None, span)
            calculate_bold_offsets(
                wxr,
                span,
                example.translation,
                example,
                "bold_translation_offsets",
            )
        elif "e-literally" in css_class:
            example.literal_meaning = clean_node(wxr, None, span)
            calculate_bold_offsets(
                wxr,
                span,
                example.literal_meaning,
                example,
                "bold_literal_offsets",
            )
        elif "qualifier-content" in css_class:
            qualifier = clean_node(wxr, None, span)
            if qualifier != "":
                example.raw_tags.append(qualifier)
    example.ref = clean_node(
        wxr, None, t_node.template_parameters.get("ref", "")
    )
    if example.text != "":
        translate_raw_tags(example)
        sense.examples.append(example)
    # Record category links produced by the expanded template on the sense.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)
def extract_quote_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> str:
    """Parse a {{quote-*}} or {{RQ:*}} template.

    When the template carries no quoted passage (no "text"/"passage"/arg 7),
    it is citation-only: the whole expansion is returned as the reference
    string so later sibling example templates can reuse it.  Otherwise the
    expansion is scanned for quotation, source, translation and
    transliteration spans, and an Example is appended to *sense*.
    """
    ref = ""
    if all(
        arg not in t_node.template_parameters for arg in ["text", "passage", 7]
    ):
        # Citation-only template: the whole expansion is the reference.
        ref = clean_node(wxr, sense, t_node)
    else:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        example = Example(text="")
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if "cited-source" == span_class:
                example.ref = clean_node(wxr, None, span_tag)
            elif "e-quotation" in span_class:
                # Strip ruby annotations before cleaning the quoted text.
                example.ruby, node_without_ruby = extract_ruby(wxr, span_tag)
                example.text = clean_node(wxr, None, node_without_ruby)
                calculate_bold_offsets(
                    wxr, span_tag, example.text, example, "bold_text_offsets"
                )
            elif "e-translation" in span_class:
                example.translation = clean_node(wxr, None, span_tag)
                calculate_bold_offsets(
                    wxr,
                    span_tag,
                    example.translation,
                    example,
                    # Fixed: was "bold_translation_text", which does not
                    # match the field name every other call in this file
                    # uses, so translation offsets were lost.
                    "bold_translation_offsets",
                )
        for i_tag in expanded_node.find_html_recursively(
            "i", attr_name="class", attr_value="e-transliteration"
        ):
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )
            break  # only the first transliteration is used
        if example.text != "":
            sense.examples.append(example)
        # Collect category links from the expansion onto the sense.
        clean_node(wxr, sense, expanded_node)
    return ref
def extract_ja_x_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense, ref: str
) -> None:
    """Parse a Japanese example template ({{ja-usex}} and aliases).

    Japanese text is in a span with class "Jpan" (ruby stripped first),
    romanization in a span with class "tr"; positional argument 3 is the
    translation and the "lit" argument the literal meaning.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="", ref=ref)
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        example.ruby, plain_node = extract_ruby(wxr, jpan_span)
        example.text = clean_node(wxr, None, plain_node)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(plain_node)),
            example.text,
            example,
            "bold_text_offsets",
        )
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr, tr_span, example.roman, example, "bold_roman_offsets"
        )
    translation_arg = t_node.template_parameters.get(3, "")
    example.translation = clean_node(wxr, None, translation_arg)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
        example.translation,
        example,
        "bold_translation_offsets",
    )
    lit_arg = t_node.template_parameters.get("lit", "")
    example.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(lit_arg)),
        example.literal_meaning,
        example,
        "bold_literal_offsets",
    )
    if example.text != "":
        sense.examples.append(example)
    # Record category links produced by the expansion on the sense.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)
def extract_zh_x_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense, ref: str
):
    """Parse a Chinese example template ({{zh-x}} and aliases).

    Multi-line renderings are wrapped in a <dl>; single-line ones are not,
    hence the fallback.  The second positional argument is the translation,
    shared by all script-variant examples.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    collected: list[Example] = []
    for dl_tag in expanded.find_html("dl"):
        collected.extend(extract_zh_x_dl_tag(wxr, dl_tag))
    if len(collected) == 0:
        collected.extend(extract_zh_x_no_dl_tag(wxr, expanded))

    translation_arg = t_node.template_parameters.get(2, "")
    translation = clean_node(wxr, None, translation_arg)
    for example in collected:
        example.translation = translation
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
            translation,
            example,
            "bold_translation_offsets",
        )
        translate_raw_tags(example)

    # Record category links from the expansion, then attach the examples.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)
    sense.examples.extend(collected)
def extract_zh_x_dl_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
    """Extract examples from a <dl> rendering of {{zh-x}}.

    Direct <span> children with a "lang" attribute carry example text (one
    per script variant); bracketed spans carry raw tags for the preceding
    example.  <dd> children hold romanization, further tags, and the source
    reference (a <small> tag prefixed with "Từ:").
    """
    results: list[Example] = []
    for span_tag in dl_tag.find_html("span"):
        if "lang" in span_tag.attrs:
            text = clean_node(wxr, None, span_tag)
            if text != "":
                example = Example(text=text)
                calculate_bold_offsets(
                    wxr, span_tag, text, example, "bold_text_offsets"
                )
                results.append(example)
        else:
            # Bracketed comma-separated raw tags, e.g. "[MSC, trad.]".
            tag_text = clean_node(wxr, None, span_tag).strip("[] ")
            for piece in tag_text.split(","):
                piece = piece.strip()
                if piece != "" and len(results) > 0:
                    results[-1].raw_tags.append(piece)
    for dd_tag in dl_tag.find_html("dd"):
        for span_tag in dd_tag.find_html("span"):
            if "Latn" in span_tag.attrs.get("lang", ""):
                # Romanization applies to every script variant.
                roman = clean_node(wxr, None, span_tag)
                for example in results:
                    example.roman = roman
                    calculate_bold_offsets(
                        wxr, span_tag, roman, example, "bold_roman_offsets"
                    )
            else:
                raw_tag = clean_node(wxr, None, span_tag).strip("[] ")
                if raw_tag != "":
                    for example in results:
                        example.raw_tags.append(raw_tag)
        for small_tag in dd_tag.find_html("small"):
            source = (
                clean_node(wxr, None, small_tag).removeprefix("Từ:").strip()
            )
            for example in results:
                example.ref = source
    return results
def extract_zh_x_no_dl_tag(
    wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
    """Extract examples from a single-line {{zh-x}} rendering (no <dl>).

    Spans with lang "zh-Hant"/"zh-Hans" each become one tagged Example;
    a "zh-Latn" span supplies the romanization for all examples collected
    so far.
    """
    results: list[Example] = []
    for span_tag in expanded_node.find_html("span"):
        lang_code = span_tag.attrs.get("lang", "")
        if lang_code == "zh-Latn":
            roman = clean_node(wxr, None, span_tag)
            for example in results:
                example.roman = roman
                calculate_bold_offsets(
                    wxr, span_tag, roman, example, "bold_roman_offsets"
                )
        elif lang_code in ("zh-Hant", "zh-Hans"):
            text = clean_node(wxr, None, span_tag)
            example = Example(text=text)
            example.tags.append(
                "Traditional-Chinese"
                if lang_code == "zh-Hant"
                else "Simplified-Chinese"
            )
            if example.text != "":
                calculate_bold_offsets(
                    wxr, span_tag, text, example, "bold_text_offsets"
                )
                results.append(example)
    return results