# Coverage for src/wiktextract/extractor/ko/example.py: 58%
# 163 statements
# coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets, set_sound_file_url_fields
7from .models import Example, Sense, Sound
8from .tags import translate_raw_tags
def extract_example_list_item(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
    lang_code: str,
    parent_example: Example | None = None,
) -> None:
    """Parse one example list item and append the resulting examples to
    ``sense.examples``; recurses into nested lists (translation lines)."""
    example = parent_example if parent_example is not None else Example()
    text_parts = []  # nodes that make up the example text
    tr_parts = []  # nodes following a {{lang}} template → translation
    seen_lang_template = False
    for child in list_item.children:
        is_template = isinstance(child, TemplateNode)
        if is_template and child.template_name == "lang":
            seen_lang_template = True
            extract_example_lang_template(wxr, example, child, lang_code)
        elif is_template and child.template_name.startswith(
            ("따옴", "지봉유설")
        ):
            ref_str = clean_node(wxr, None, child)
            example.ref = ref_str.strip("() ").removeprefix("따옴◄")
        elif is_template and child.template_name in ("예문", "ux", "uxi"):
            extract_ux_template(wxr, sense, example, child)
            break
        elif is_template and child.template_name in ("zh-x", "zh-usex"):
            extract_template_zh_x(wxr, sense, child)
            break
        elif seen_lang_template:
            # everything after {{lang}} is the translation text
            tr_parts.append(child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            break
        elif (
            isinstance(child, WikiNode)
            and child.kind == NodeKind.LINK
            and len(child.largs) > 0
            and len(child.largs[0]) > 0
            and isinstance(child.largs[0][0], str)
            and child.largs[0][0].startswith("File:")
        ):
            # audio file link attached to the example
            sound = Sound()
            filename = child.largs[0][0].removeprefix("File:").strip()
            set_sound_file_url_fields(wxr, filename, sound)
            if sound.audio != "":
                example.sounds.append(sound)
        else:
            text_parts.append(child)

    example_text = clean_node(wxr, sense, text_parts)
    if example_text != "":
        example.text = example_text
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(text_parts)),
            example_text,
            example,
            "bold_text_offsets",
        )
    translation = clean_node(wxr, sense, tr_parts)
    if translation != "":
        example.translation = translation

    if len(example.text) > 0:
        if lang_code == "zh" and "/" in example.text:
            # text is "traditional/simplified"; keep bold offsets only for
            # the first (traditional) half, then emit one example per form
            half = len(example.bold_text_offsets) // 2
            example.bold_text_offsets = example.bold_text_offsets[:half]
            for idx, variant in enumerate(example.text.split("/", 1)):
                copied = example.model_copy(deep=True)
                copied.text = variant
                copied.tags.append(
                    "Traditional-Chinese"
                    if idx == 0
                    else "Simplified-Chinese"
                )
                sense.examples.append(copied)
        else:
            sense.examples.append(example)

    for child_list in list_item.find_child(NodeKind.LIST):
        for child_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_example_list_item(
                wxr,
                sense,
                child_item,
                lang_code,
                # reuse the current example only if it is still empty
                example if example.text == "" else Example(),
            )
def extract_example_lang_template(
    wxr: WiktextractContext,
    example: Example,
    node: TemplateNode,
    lang_code: str,
) -> None:
    """Fill ``example`` from a {{lang}} template.

    https://ko.wiktionary.org/wiki/틀:lang
    Argument 2 is the example text (with ruby for Japanese), argument 4 the
    translation.
    """
    if lang_code == "ja":
        # Japanese text may contain ruby annotations; strip them out first
        example.ruby, stripped_nodes = extract_ruby(
            wxr,
            wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node.template_parameters.get(2, "")),
                expand_all=True,
            ).children,
        )
        example.text = clean_node(wxr, None, stripped_nodes)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(stripped_nodes)),
            example.text,
            example,
            "bold_text_offsets",
        )
    else:
        text_arg = node.template_parameters.get(2, "")
        example.text = clean_node(wxr, None, text_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(text_arg)),
            example.text,
            example,
            "bold_text_offsets",
        )
    translation_arg = node.template_parameters.get(4, "")
    example.translation = clean_node(wxr, None, translation_arg)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
        example.translation,
        example,
        "bold_translation_offsets",
    )
    # Chinese text may carry a trailing romanization in parentheses
    if lang_code == "zh" and "(" in example.text and example.text.endswith(")"):
        paren_index = example.text.index("(")
        example.roman = example.text[paren_index:].strip("() ")
        example.text = example.text[:paren_index].strip()
def extract_ux_template(
    wxr: WiktextractContext,
    sense: Sense,
    example: Example,
    t_node: TemplateNode,
) -> None:
    """Fill ``example`` from a {{ux}}/{{uxi}}/{{예문}} template.

    https://ko.wiktionary.org/wiki/틀:ux
    https://ko.wiktionary.org/wiki/모듈:usex/templates
    """
    lang_code = t_node.template_parameters.get(1, "")
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    if lang_code == "ja":
        # Japanese: pull text (with ruby) and romanization from the
        # rendered HTML spans
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if span_class == "Jpan":
                example.ruby, plain_nodes = extract_ruby(wxr, span_tag)
                example.text = clean_node(wxr, None, plain_nodes)
                calculate_bold_offsets(
                    wxr,
                    wxr.wtp.parse(wxr.wtp.node_to_wikitext(plain_nodes)),
                    example.text,
                    example,
                    "bold_text_offsets",
                )
            elif span_class == "tr":
                example.roman = clean_node(wxr, None, span_tag)
                calculate_bold_offsets(
                    wxr,
                    wxr.wtp.parse(wxr.wtp.node_to_wikitext(span_tag)),
                    example.roman,
                    example,
                    "bold_roman_offsets",
                )
        translation_arg = t_node.template_parameters.get(4, "")
        example.translation = clean_node(wxr, None, translation_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
            example.translation,
            example,
            "bold_translation_offsets",
        )
        literal_arg = t_node.template_parameters.get("lit", "")
        example.literal_meaning = clean_node(wxr, None, literal_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(literal_arg)),
            example.literal_meaning,
            example,
            "bold_literal_offsets",
        )
        if example.ref == "":
            example.ref = clean_node(
                wxr, None, t_node.template_parameters.get("ref", "")
            )
    else:
        # non-Japanese: text in argument 2, translation in argument 3
        text_arg = t_node.template_parameters.get(2, "")
        example.text = clean_node(wxr, None, text_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(text_arg)),
            example.text,
            example,
            "bold_text_offsets",
        )
        translation_arg = t_node.template_parameters.get(3, "")
        example.translation = clean_node(wxr, None, translation_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
            example.translation,
            example,
            "bold_translation_offsets",
        )
        example.note = clean_node(
            wxr, None, t_node.template_parameters.get("footer", "")
        )
        # reference may come from either the Korean or the English parameter
        if example.ref == "":
            example.ref = clean_node(
                wxr, None, t_node.template_parameters.get("출처", "")
            )
        if example.ref == "":
            example.ref = clean_node(
                wxr, None, t_node.template_parameters.get("source", "")
            )

    # record category links into the sense
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)
def extract_template_zh_x(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Extract examples from a {{zh-x}}/{{zh-usex}} template and append
    them to ``sense.examples``."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    collected = []
    for dl_tag in expanded.find_html("dl"):
        collected.extend(extract_zh_x_dl_tag(wxr, dl_tag))
    if len(collected) == 0:
        # short form: spans are direct children, no <dl> wrapper
        collected.extend(extract_zh_x_no_dl_tag(wxr, expanded))

    # argument 2 holds the shared translation for every variant
    translation_arg = t_node.template_parameters.get(2, "")
    translation = clean_node(wxr, None, translation_arg)
    for e_data in collected:
        e_data.translation = translation
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
            translation,
            e_data,
            "bold_translation_offsets",
        )
        translate_raw_tags(e_data)

    # record category links into the sense
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

    sense.examples.extend(collected)
def extract_zh_x_dl_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
    """Extract examples from the <dl> layout of an expanded {{zh-x}}:
    text spans at the top level, romanization and tags inside <dd>."""
    results: list[Example] = []
    for span_tag in dl_tag.find_html("span"):
        if "lang" in span_tag.attrs:
            # a text variant (traditional or simplified)
            span_text = clean_node(wxr, None, span_tag)
            if span_text != "":
                new_example = Example(text=span_text)
                calculate_bold_offsets(
                    wxr, span_tag, span_text, new_example, "bold_text_offsets"
                )
                results.append(new_example)
        else:
            # bracketed tag list like "[Classical, trad.]" — applies to the
            # most recently added example
            tag_str = clean_node(wxr, None, span_tag).strip("[] ")
            for raw_tag in tag_str.split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "" and len(results) > 0:
                    results[-1].raw_tags.append(raw_tag)
    for dd_tag in dl_tag.find_html("dd"):
        for span_tag in dd_tag.find_html("span"):
            if "Latn" in span_tag.attrs.get("lang", ""):
                # romanization applies to every variant
                roman_text = clean_node(wxr, None, span_tag)
                for e_data in results:
                    e_data.roman = roman_text
                    calculate_bold_offsets(
                        wxr, span_tag, roman_text, e_data, "bold_roman_offsets"
                    )
            else:
                tag_text = clean_node(wxr, None, span_tag).strip("[] ")
                if tag_text != "":
                    for e_data in results:
                        e_data.raw_tags.append(tag_text)
    return results
315def extract_zh_x_no_dl_tag(
316 wxr: WiktextractContext, expanded_node: WikiNode
317) -> list[Example]:
318 examples = []
319 for span_tag in expanded_node.find_html("span"):
320 lang = span_tag.attrs.get("lang", "")
321 match lang:
322 case "zh-Latn":
323 roman = clean_node(wxr, None, span_tag)
324 for e_data in examples:
325 e_data.roman = roman
326 calculate_bold_offsets(
327 wxr, span_tag, roman, e_data, "bold_roman_offsets"
328 )
329 case "zh-Hant" | "zh-Hans":
330 e_text = clean_node(wxr, None, span_tag)
331 example = Example(text=e_text)
332 example.tags.append(
333 "Traditional-Chinese"
334 if lang == "zh-Hant"
335 else "Simplified-Chinese"
336 )
337 if example.text != "":
338 calculate_bold_offsets(
339 wxr, span_tag, e_text, example, "bold_text_offsets"
340 )
341 examples.append(example)
343 return examples