Coverage for src/wiktextract/extractor/ko/pos.py: 72% (205 statements)
coverage.py v7.12.0, created at 2025-11-26 11:06 +0000
import re

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .example import extract_example_list_item
from .linkage import (
    LINKAGE_TEMPLATES,
    extract_linkage_list_item,
    extract_linkage_template,
)
from .models import AltForm, Classifier, Form, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .tags import translate_raw_tags
from .translation import extract_translation_template


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    page_data.append(base_data.model_copy(deep=True))
    orig_title = pos_title
    pos_title = pos_title.removeprefix("보조 ").strip()
    if pos_title in POS_DATA:
        page_data[-1].pos_title = orig_title
        pos_data = POS_DATA[pos_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        if (  # coverage: 42 ↛ 46, condition never true
            orig_title.startswith("보조 ")
            and "auxiliary" not in page_data[-1].tags
        ):
            page_data[-1].tags.append("auxiliary")

    has_linkage = False
    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode):
            if node.template_name in SOUND_TEMPLATES:
                extract_sound_template(wxr, page_data[-1], node)
            elif node.template_name in LINKAGE_TEMPLATES:
                has_linkage = extract_linkage_template(
                    wxr, page_data[-1], node, "derived"
                )
            elif node.template_name == "외국어":
                extract_translation_template(
                    wxr,
                    page_data[-1],
                    node,
                    page_data[-1].senses[-1].glosses[-1]
                    if len(page_data[-1].senses) > 0
                    else "",
                )
            elif node.template_name.startswith(  # coverage: 66 ↛ 49, condition always true
                base_data.lang_code + "-"
            ) or node.template_name.endswith((" 동사", " 명사", " 고유명사")):
                extract_headword_line_template(wxr, page_data[-1], node)
        elif node.kind == NodeKind.LIST:  # coverage: 70 ↛ 49, condition always true
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr,
                        page_data[-1],
                        list_item,
                        Sense(pattern=page_data[-1].pattern),
                    )
                else:
                    extract_unorderd_list_item(wxr, page_data[-1], list_item)

    if not (
        len(page_data[-1].senses) > 0
        or len(page_data[-1].sounds) > len(base_data.sounds)
        or len(page_data[-1].translations) > len(base_data.translations)
        or has_linkage
    ):
        page_data.pop()
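
# Note on the "보조 " prefix handling above: a heading such as "보조 동사"
# ("auxiliary verb") is looked up in POS_DATA as plain "동사" after the prefix
# is stripped ("보조 동사".removeprefix("보조 ").strip() == "동사"), while the
# entry keeps the original heading as pos_title and gains an "auxiliary" tag.
# The heading here is illustrative; the real keys live in POS_DATA.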


def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense,
) -> None:
    gloss_nodes = []
    sense = parent_sense.model_copy(deep=True)
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            gloss_text = clean_node(wxr, sense, gloss_nodes)
            if len(gloss_text) > 0:  # coverage: 102 ↛ 107, condition always true
                sense.glosses.append(gloss_text)
                translate_raw_tags(sense)
                word_entry.senses.append(sense)
                gloss_nodes.clear()
            for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr, word_entry, nested_list_item, sense
                    )
                else:
                    extract_unorderd_list_item(
                        wxr, word_entry, nested_list_item
                    )
            continue
        elif isinstance(node, TemplateNode) and node.template_name.endswith(
            " of"
        ):
            extract_form_of_template(wxr, sense, node)
            gloss_nodes.append(node)
        elif isinstance(node, TemplateNode) and node.template_name == "라벨":
            sense.raw_tags.extend(
                [
                    raw_tag.strip()
                    for raw_tag in clean_node(wxr, sense, node)
                    .strip("()")
                    .split(",")
                ]
            )
        elif isinstance(node, TemplateNode) and node.template_name == "zh-mw":  # coverage: 131 ↛ 132, condition never true
            extract_zh_mw_template(wxr, node, sense)
        else:
            gloss_nodes.append(node)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)
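
# A minimal sketch of the "라벨" (label) parsing above, assuming the template
# expands to a parenthesized, comma-separated label list; the example string
# is hypothetical:
#     [t.strip() for t in "(문학, 고어)".strip("()").split(",")]
#     # -> ["문학", "고어"]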


def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            # `* '''1.''' gloss text`, terrible obsolete layout
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = list_item.children[index + 1 :]
                extract_gloss_list_item(wxr, word_entry, new_list_item, Sense())
                break
        elif isinstance(node, str) and "어원:" in node:
            etymology_nodes = []
            etymology_nodes.append(node[node.index(":") + 1 :])
            etymology_nodes.extend(list_item.children[index + 1 :])
            e_text = clean_node(wxr, None, etymology_nodes)
            if len(e_text) > 0:  # coverage: 166 ↛ 168, condition always true
                word_entry.etymology_texts.append(e_text)
            break
        elif (
            isinstance(node, str)
            and re.search(r"(?:참고|참조|활용):", node) is not None
        ):
            note_str = node[node.index(":") + 1 :].strip()
            note_str += clean_node(
                wxr,
                word_entry.senses[-1]
                if len(word_entry.senses) > 0
                else word_entry,
                list_item.children[index + 1 :],
            )
            if len(word_entry.senses) > 0:
                word_entry.senses[-1].note = note_str
            else:
                word_entry.note = note_str
            break
        elif (
            isinstance(node, str)
            and ":" in node
            and node[: node.index(":")].strip() in LINKAGE_SECTIONS
        ):
            extract_linkage_list_item(wxr, word_entry, list_item, "", False)
            break
        elif isinstance(node, str) and "문형:" in node:
            word_entry.pattern = node[node.index(":") + 1 :].strip()
            word_entry.pattern += clean_node(
                wxr, None, list_item.children[index + 1 :]
            )
            break
        else:
            if len(word_entry.senses) > 0:
                extract_example_list_item(
                    wxr, word_entry.senses[-1], list_item, word_entry.lang_code
                )
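
# Quick reference for the bold sense-number regex used above; these checks are
# plain Python and need no extractor context:
#     re.fullmatch(r"\d+(?:-\d+)?\.?", "1.")   # matches
#     re.fullmatch(r"\d+(?:-\d+)?\.?", "2-3")  # matches
#     re.fullmatch(r"\d+(?:-\d+)?\.?", "기타")  # None (not a sense number)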


def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    if "form-of" not in sense.tags:  # coverage: 209 ↛ 211, condition always true
        sense.tags.append("form-of")
    word_arg = 1 if t_node.template_name == "ko-hanja form of" else 2
    word = clean_node(wxr, None, t_node.template_parameters.get(word_arg, ""))
    if len(word) > 0:  # coverage: 213 ↛ exit, condition always true
        sense.form_of.append(AltForm(word=word))
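
# word_arg above reflects differing parameter positions: "ko-hanja form of"
# carries the target word in positional parameter 1, while the other
# "... of" templates are assumed to carry it in parameter 2.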


def extract_grammar_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        word_entry.note = clean_node(wxr, None, list_item.children)
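
# Note that each list item overwrites word_entry.note, so a grammar-note
# section with several items keeps only the last one's text.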


def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    # Chinese inline classifier template
    # copied from zh edition code
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
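
# Illustrative shape of the expanded zh-mw HTML walked above (assumed markup,
# modelled on the Chinese edition's output):
#     <span class="Hant">個</span><span class="Hani">/</span><span class="Hans">个</span>
# A "/" span joins a traditional/simplified pair into one classifier group,
# which is why last_word is tracked across spans.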


def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    forms = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        i_tags = []
        for html_node in main_span_tag.find_child(NodeKind.HTML):
            class_names = html_node.attrs.get("class", "").split()
            if html_node.tag == "strong" and "headword" in class_names:
                ruby, no_ruby = extract_ruby(wxr, html_node)
                strong_str = clean_node(wxr, None, no_ruby)
                if strong_str not in ["", wxr.wtp.title] or len(ruby) > 0:
                    forms.append(
                        Form(form=strong_str, tags=["canonical"], ruby=ruby)
                    )
            elif html_node.tag == "span":
                if "headword-tr" in class_names or "tr" in class_names:
                    roman = clean_node(wxr, None, html_node)
                    if (
                        len(forms) > 0
                        and "canonical" not in forms[-1].tags
                        and "romanization" not in forms[-1].tags
                    ):
                        forms[-1].roman = roman
                    elif roman != "":  # coverage: 289 ↛ 271, condition always true
                        forms.append(Form(form=roman, tags=["romanization"]))
                elif "gender" in class_names:  # coverage: 291 ↛ 292, condition never true
                    for abbr_tag in html_node.find_html("abbr"):
                        gender_tag = clean_node(wxr, None, abbr_tag)
                        if (
                            len(forms) > 0
                            and "canonical" not in forms[-1].tags
                            and "romanization" not in forms[-1].tags
                        ):
                            forms[-1].raw_tags.append(gender_tag)
                        else:
                            word_entry.raw_tags.append(gender_tag)
                elif "ib-content" in class_names:  # coverage: 302 ↛ 303, condition never true
                    raw_tag = clean_node(wxr, None, html_node)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
            elif html_node.tag == "sup" and word_entry.lang_code == "ja":  # coverage: 306 ↛ 307, condition never true
                forms.append(extract_historical_kana(wxr, html_node))
            elif html_node.tag == "i":
                if len(i_tags) > 0:
                    word_entry.raw_tags.extend(i_tags)
                    i_tags.clear()
                for i_child in html_node.children:
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":  # coverage: 318 ↛ 312, condition always true
                        i_tags.append(raw_tag)
            elif html_node.tag == "b":  # coverage: 320 ↛ 271, condition always true
                ruby, no_ruby = extract_ruby(wxr, html_node)
                for form_str in filter(
                    None,
                    map(str.strip, clean_node(wxr, None, no_ruby).split(",")),
                ):
                    form = Form(form=form_str, ruby=ruby)
                    if i_tags == ["또는"]:  # coverage: 327 ↛ 328, condition never true
                        if len(forms) > 0:
                            form.raw_tags.extend(forms[-1].raw_tags)
                    else:
                        form.raw_tags.extend(i_tags)
                    forms.append(form)
                i_tags.clear()

        if len(i_tags) > 0:  # coverage: 335 ↛ 336, condition never true
            word_entry.raw_tags.extend(i_tags)

    for form in forms:
        translate_raw_tags(form)
    word_entry.forms.extend(forms)
    clean_node(wxr, word_entry, expanded_node)
    translate_raw_tags(word_entry)
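
# Sketch of the headword-line children handled above (inferred from the
# branches, not an exhaustive spec): <strong class="headword"> gives the
# canonical form, <span class="tr"> a romanization, <span class="gender">
# gender abbreviations, <span class="ib-content"> extra raw tags, <i> inline
# labels attached to the <b> alternative forms that follow, and <sup>
# historical kana on Japanese entries.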


def extract_historical_kana(
    wxr: WiktextractContext, sup_node: HTMLNode
) -> Form:
    form = Form(form="", tags=["archaic"])
    for strong_node in sup_node.find_html("strong"):
        form.form = clean_node(wxr, None, strong_node)
    for span_node in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        form.roman = clean_node(wxr, None, span_node)
    return form
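
# Assumed <sup> markup for the historical-kana branch (hypothetical example):
#     <sup><strong>かは</strong> <span class="tr">kafa</span></sup>
# which would yield Form(form="かは", roman="kafa", tags=["archaic"]).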