Coverage for src/wiktextract/extractor/th/example.py: 80%
163 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..ruby import extract_ruby
8from ..share import calculate_bold_offsets
9from .models import Example, Sense, WordEntry
10from .tags import translate_raw_tags
13def extract_example_list_item(
14 wxr: WiktextractContext,
15 word_entry: WordEntry,
16 sense: Sense,
17 list_item: WikiNode,
18 ref: str = "",
19) -> None:
20 from .linkage import LINKAGE_TEMPLATES, extract_syn_template
22 for node in list_item.children:
23 if isinstance(node, TemplateNode):
24 if node.template_name in ["ux", "usex", "ko-usex"]:
25 extract_ux_template(wxr, sense, node)
26 elif node.template_name in ["zh-x", "zh-usex"]:
27 extract_template_zh_x(wxr, sense, node)
28 elif node.template_name in ["ja-x", "ja-usex"]:
29 extract_template_ja_usex(wxr, sense, node, ref)
30 elif node.template_name.startswith("quote-"):
31 ref = extract_quote_template(wxr, sense, node)
32 elif node.template_name in LINKAGE_TEMPLATES:
33 extract_syn_template(
34 wxr, word_entry, node, LINKAGE_TEMPLATES[node.template_name]
35 )
36 elif node.template_name == "audio" and len(sense.examples) > 0: 36 ↛ 22line 36 didn't jump to line 22 because the condition on line 36 was always true
37 from .sound import extract_audio_template
39 extract_audio_template(wxr, sense.examples[-1], node)
40 sense.categories.extend(sense.examples[-1].categories)
41 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
42 for child_list_item in node.find_child(NodeKind.LIST_ITEM):
43 extract_example_list_item(
44 wxr, word_entry, sense, child_list_item, ref
45 )
def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Build an ``Example`` from an expanded ``ux``-style template.

    The template is expanded and its HTML is scanned: ``<i>`` tags supply
    the example text and romanization, ``<span>`` tags supply the
    translation, the literal meaning and qualifier raw tags. The ``ref``
    template parameter is stored as the example's reference. The example
    is appended to ``sense.examples`` only when its text is not empty; in
    that case the top-level link nodes of the expansion are also passed to
    ``clean_node`` together with ``sense``.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    example = Example(text="")

    # (class marker, Example attribute, bold-offsets attribute); the first
    # marker found in the tag's class string wins.
    italic_fields = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
    )
    span_fields = (
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )

    def store(tag: HTMLNode, attr: str, offsets_attr: str) -> None:
        # Save the cleaned text, then record bold offsets for it.
        setattr(example, attr, clean_node(wxr, None, tag))
        calculate_bold_offsets(
            wxr, tag, getattr(example, attr), example, offsets_attr
        )

    for italic in root.find_html_recursively("i"):
        css_class = italic.attrs.get("class", "")
        for marker, attr, offsets_attr in italic_fields:
            if marker in css_class:
                store(italic, attr, offsets_attr)
                break

    for span in root.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        for marker, attr, offsets_attr in span_fields:
            if marker in css_class:
                store(span, attr, offsets_attr)
                break
        else:
            if "qualifier-content" in css_class:
                label = clean_node(wxr, None, span)
                if label != "":
                    example.raw_tags.append(label)

    ref_arg = t_node.template_parameters.get("ref", "")
    example.ref = clean_node(wxr, None, ref_arg)

    if example.text == "":
        return
    translate_raw_tags(example)
    sense.examples.append(example)
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)
def extract_template_zh_x(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Extract examples from an expanded ``zh-x``/``zh-usex`` template.

    Examples are first collected from every top-level ``<dl>`` tag of the
    expansion; when none are found, the layout without ``<dl>`` is tried
    instead. Every collected example receives the template's second
    positional argument as its translation, and its raw tags are
    translated. Top-level link nodes of the expansion are passed to
    ``clean_node`` together with ``sense``, and finally the examples are
    appended to ``sense.examples``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    examples: list[Example] = []
    for dl_tag in expanded_node.find_html("dl"):
        examples.extend(extract_zh_x_dl_tag(wxr, dl_tag))
    if not examples:
        examples.extend(extract_zh_x_no_dl_tag(wxr, expanded_node))

    second_arg = t_node.template_parameters.get(2, "")
    translation = clean_node(wxr, None, second_arg)
    if examples:
        # The translation argument is the same for every example, so it
        # is serialised and parsed once instead of once per example.
        translation_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(second_arg)
        )
        for e_data in examples:
            e_data.translation = translation
            calculate_bold_offsets(
                wxr,
                translation_node,
                translation,
                e_data,
                "bold_translation_offsets",
            )
            translate_raw_tags(e_data)

    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

    sense.examples.extend(examples)
def extract_zh_x_dl_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
    """Collect examples from one ``<dl>`` tag of an expanded ``zh-x``.

    Direct ``<span>`` children carrying a ``lang`` attribute each start a
    new example; other direct spans hold raw tags (split on ``", "`` and
    ``" and "``) that are attached to the most recently created example.
    Spans inside ``<dd>`` children either give the romanization (``lang``
    containing ``Latn``) or a raw tag, both applied to every example.
    """
    collected: list[Example] = []

    for span in dl_tag.find_html("span"):
        if "lang" not in span.attrs:
            label_text = clean_node(wxr, None, span).strip("[] ")
            for label in re.split(r", | and ", label_text):
                label = label.strip()
                # Labels met before any example exists are dropped.
                if label != "" and collected:
                    collected[-1].raw_tags.append(label)
            continue

        text = clean_node(wxr, None, span)
        if text == "":
            continue
        new_example = Example(text=text)
        calculate_bold_offsets(
            wxr, span, text, new_example, "bold_text_offsets"
        )
        collected.append(new_example)

    for dd in dl_tag.find_html("dd"):
        for span in dd.find_html("span"):
            if "Latn" in span.attrs.get("lang", ""):
                romanization = clean_node(wxr, None, span)
                for item in collected:
                    item.roman = romanization
                    calculate_bold_offsets(
                        wxr, span, romanization, item, "bold_roman_offsets"
                    )
                continue

            label = clean_node(wxr, None, span).strip("[] ")
            if label == "":
                continue
            for item in collected:
                item.raw_tags.append(label)

    return collected
def extract_zh_x_no_dl_tag(
    wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
    """Collect examples from an expanded ``zh-x`` that has no ``<dl>``.

    Top-level ``<span>`` tags are classified by their ``lang`` attribute:
    ``zh-Hant``/``zh-Hans`` spans create a new example tagged with the
    matching script name (kept only when its text is not empty), and a
    ``zh-Latn`` span sets the romanization of every example collected so
    far.
    """
    script_tags = {
        "zh-Hant": "Traditional-Chinese",
        "zh-Hans": "Simplified-Chinese",
    }
    results: list[Example] = []

    for span in expanded_node.find_html("span"):
        lang = span.attrs.get("lang", "")
        if lang == "zh-Latn":
            romanization = clean_node(wxr, None, span)
            for item in results:
                item.roman = romanization
                calculate_bold_offsets(
                    wxr, span, romanization, item, "bold_roman_offsets"
                )
        elif lang in script_tags:
            text = clean_node(wxr, None, span)
            new_example = Example(text=text)
            new_example.tags.append(script_tags[lang])
            if new_example.text != "":
                calculate_bold_offsets(
                    wxr, span, text, new_example, "bold_text_offsets"
                )
                results.append(new_example)

    return results
def extract_quote_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> str:
    """Extract a ``quote-*`` template as a reference or a full example.

    When the template has none of the ``text``, ``passage`` or ``7``
    parameters it only carries a citation: the cleaned template text is
    returned so the caller can use it as the reference for later examples.

    Otherwise the template is expanded and an ``Example`` is built from
    its HTML: the ``cited-source`` span gives the reference, the
    ``e-quotation`` span the text, the ``e-translation`` span the
    translation and the first ``e-transliteration`` ``<i>`` tag the
    romanization. The example is appended to ``sense.examples`` only when
    its text is not empty. An empty string is returned in this case.
    """
    ref = ""
    if all(
        arg not in t_node.template_parameters for arg in ["text", "passage", 7]
    ):
        ref = clean_node(wxr, sense, t_node)
    else:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        example = Example(text="")
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if "cited-source" == span_class:
                example.ref = clean_node(wxr, None, span_tag)
            elif "e-quotation" in span_class:
                example.text = clean_node(wxr, None, span_tag)
                calculate_bold_offsets(
                    wxr, span_tag, example.text, example, "bold_text_offsets"
                )
            elif "e-translation" in span_class:
                example.translation = clean_node(wxr, None, span_tag)
                # Use the same offsets field as the other example
                # extractors in this module; the previous name
                # "bold_translation_text" matched none of them.
                calculate_bold_offsets(
                    wxr,
                    span_tag,
                    example.translation,
                    example,
                    "bold_translation_offsets",
                )
        for i_tag in expanded_node.find_html_recursively(
            "i", attr_name="class", attr_value="e-transliteration"
        ):
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )
            # Only the first transliteration tag is used.
            break
        if example.text != "":
            sense.examples.append(example)
            clean_node(wxr, sense, expanded_node)

    return ref
def extract_template_ja_usex(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, ref: str
) -> None:
    """Build an ``Example`` from an expanded ``ja-x``/``ja-usex`` template.

    The example text and ruby data come from top-level ``<span>`` tags
    with class ``Jpan``, the romanization from ``<span>`` tags with class
    ``tr``, the translation from the third positional argument and the
    literal meaning from the ``lit`` argument. ``ref`` is stored as the
    example's reference. The example is appended to ``sense.examples``
    only when its text is not empty; in that case the top-level link nodes
    of the expansion are also passed to ``clean_node`` together with
    ``sense``.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    example = Example(text="", ref=ref)

    def fill_from_param(key: int | str, attr: str, offsets_attr: str) -> None:
        # Store the cleaned argument, then compute bold offsets from a
        # fresh parse of the argument's wikitext.
        raw_value = t_node.template_parameters.get(key, "")
        setattr(example, attr, clean_node(wxr, None, raw_value))
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(raw_value)),
            getattr(example, attr),
            example,
            offsets_attr,
        )

    for jpan_span in root.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, stripped_node = extract_ruby(wxr, jpan_span)
        example.text = clean_node(wxr, None, stripped_node)
        example.ruby = ruby_data
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(stripped_node)),
            example.text,
            example,
            "bold_text_offsets",
        )

    for tr_span in root.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr, tr_span, example.roman, example, "bold_roman_offsets"
        )

    fill_from_param(3, "translation", "bold_translation_offsets")
    fill_from_param("lit", "literal_meaning", "bold_literal_offsets")

    if example.text == "":
        return
    sense.examples.append(example)
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)