Coverage for src/wiktextract/extractor/ko/example.py: 96%
99 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1from wikitextprocessor import NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets, set_sound_file_url_fields
7from .models import Example, Sense, Sound
def extract_example_list_item(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
    lang_code: str,
    parent_example: Example | None = None,
) -> None:
    """Extract one example list item into ``sense.examples``.

    Recognizes the "lang" template (nodes after it are collected as the
    translation), citation templates ("따옴*", "지봉유설"), the
    "예문"/"ux"/"uxi" usage-example templates, "File:" audio links, and
    nested lists (recursed into as child examples).
    """
    example = Example() if parent_example is None else parent_example
    e_text_nodes = []
    e_tr_nodes = []
    after_lang_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode) and node.template_name == "lang":
            after_lang_template = True
            extract_example_lang_template(wxr, example, node, lang_code)
        elif isinstance(node, TemplateNode) and node.template_name.startswith(
            ("따옴", "지봉유설")
        ):
            # Citation template; strip wrapping parentheses and the
            # "따옴◄" prefix left behind by template expansion.
            example.ref = (
                clean_node(wxr, None, node).strip("() ").removeprefix("따옴◄")
            )
        elif isinstance(node, TemplateNode) and node.template_name in [
            "예문",
            "ux",
            "uxi",
        ]:
            extract_ux_template(wxr, sense, example, node)
            break
        elif after_lang_template:
            # Nodes after the "lang" template are the translation text.
            e_tr_nodes.append(node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            break
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and len(node.largs) > 0
            and len(node.largs[0]) > 0
            and isinstance(node.largs[0][0], str)
            and node.largs[0][0].startswith("File:")
        ):
            # Audio-file link for the example sentence.
            sound = Sound()
            sound_file = node.largs[0][0].removeprefix("File:").strip()
            set_sound_file_url_fields(wxr, sound_file, sound)
            # Fixed: removed coverage-report artifact fused onto this line.
            if sound.audio != "":
                example.sounds.append(sound)
        else:
            e_text_nodes.append(node)

    e_text = clean_node(wxr, sense, e_text_nodes)
    if e_text != "":
        example.text = e_text
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(e_text_nodes)),
            e_text,
            example,
            "bold_text_offsets",
        )
    e_tr = clean_node(wxr, sense, e_tr_nodes)
    if e_tr != "":
        example.translation = e_tr

    if len(example.text) > 0:
        if lang_code == "zh" and "/" in example.text:
            # "traditional/simplified" pair: the bold offsets were computed
            # over both halves, so keep only the first half of them, then
            # emit one example per script form.
            example.bold_text_offsets = example.bold_text_offsets[
                : len(example.bold_text_offsets) // 2
            ]
            for index, text in enumerate(example.text.split("/", 1)):
                new_example = example.model_copy(deep=True)
                new_example.text = text
                new_example.tags.append(
                    "Traditional Chinese"
                    if index == 0
                    else "Simplified Chinese"
                )
                sense.examples.append(new_example)
        else:
            sense.examples.append(example)

    # Child list items become further examples; reuse this example as the
    # parent only when it has no text of its own yet.
    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_list_item in nested_list.find_child(NodeKind.LIST_ITEM):
            extract_example_list_item(
                wxr,
                sense,
                nested_list_item,
                lang_code,
                example if example.text == "" else Example(),
            )
def extract_example_lang_template(
    wxr: WiktextractContext,
    example: Example,
    node: TemplateNode,
    lang_code: str,
) -> None:
    """Extract example text (and translation) from a "lang" template.

    https://ko.wiktionary.org/wiki/틀:lang
    """
    text_arg = node.template_parameters.get(2, "")
    if lang_code == "ja":
        # Japanese text may carry ruby annotations; expand the argument
        # and separate ruby from the plain text.
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(text_arg),
            expand_all=True,
        )
        example.ruby, plain_nodes = extract_ruby(wxr, expanded.children)
        example.text = clean_node(wxr, None, plain_nodes)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(plain_nodes)),
            example.text,
            example,
            "bold_text_offsets",
        )
    else:
        example.text = clean_node(wxr, None, text_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(text_arg)),
            example.text,
            example,
            "bold_text_offsets",
        )
        translation_arg = node.template_parameters.get(4, "")
        example.translation = clean_node(wxr, None, translation_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
            example.translation,
            example,
            "bold_translation_offsets",
        )
        if (
            lang_code == "zh"
            and "(" in example.text
            and example.text.endswith(")")
        ):
            # Chinese text ends with its romanization in parentheses;
            # split it off into the roman field.
            paren_idx = example.text.index("(")
            example.roman = example.text[paren_idx:].strip("() ")
            example.text = example.text[:paren_idx].strip()
def extract_ux_template(
    wxr: WiktextractContext,
    sense: Sense,
    example: Example,
    t_node: TemplateNode,
) -> None:
    """Extract a "ux"/"uxi"/"예문" usage-example template into *example*.

    https://ko.wiktionary.org/wiki/틀:ux
    https://ko.wiktionary.org/wiki/모듈:usex/templates
    """
    lang_code = t_node.template_parameters.get(1, "")
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    if lang_code == "ja":
        # Japanese: take text (with ruby) and romanization from the
        # expanded HTML spans; translation/literal from the raw arguments.
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if span_class == "Jpan":
                example.ruby, no_ruby = extract_ruby(wxr, span_tag)
                example.text = clean_node(wxr, None, no_ruby)
                calculate_bold_offsets(
                    wxr,
                    wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby)),
                    example.text,
                    example,
                    "bold_text_offsets",
                )
            # Fixed: removed coverage-report artifact fused onto this line.
            elif span_class == "tr":
                example.roman = clean_node(wxr, None, span_tag)
                calculate_bold_offsets(
                    wxr,
                    wxr.wtp.parse(wxr.wtp.node_to_wikitext(span_tag)),
                    example.roman,
                    example,
                    "bold_roman_offsets",
                )
        tr_arg = t_node.template_parameters.get(4, "")
        example.translation = clean_node(wxr, None, tr_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(tr_arg)),
            example.translation,
            example,
            "bold_translation_offsets",
        )
        lit_arg = t_node.template_parameters.get("lit", "")
        example.literal_meaning = clean_node(wxr, None, lit_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(lit_arg)),
            example.literal_meaning,
            example,
            "bold_literal_offsets",
        )
        # Don't overwrite a reference already set by a citation template.
        if example.ref == "":
            example.ref = clean_node(
                wxr, None, t_node.template_parameters.get("ref", "")
            )
    else:
        second_arg = t_node.template_parameters.get(2, "")
        example.text = clean_node(wxr, None, second_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(second_arg)),
            example.text,
            example,
            "bold_text_offsets",
        )
        third_arg = t_node.template_parameters.get(3, "")
        example.translation = clean_node(wxr, None, third_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(third_arg)),
            example.translation,
            example,
            "bold_translation_offsets",
        )
        example.note = clean_node(
            wxr, None, t_node.template_parameters.get("footer", "")
        )
        # Reference may come from either the Korean "출처" or the English
        # "source" parameter; first non-empty wins.
        if example.ref == "":
            example.ref = clean_node(
                wxr, None, t_node.template_parameters.get("출처", "")
            )
        if example.ref == "":
            example.ref = clean_node(
                wxr, None, t_node.template_parameters.get("source", "")
            )

    # Register any links in the expanded template (e.g. categories) with
    # the sense.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)