Coverage for src/wiktextract/extractor/ko/pos.py: 74%
168 statements
coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
import re

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .linkage import (
    LINKAGE_TEMPLATES,
    extract_linkage_list_item,
    extract_linkage_template,
)
from .models import AltForm, Classifier, Form, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .tags import translate_raw_tags
from .translation import extract_translation_template


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
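    """Create a new ``WordEntry`` from a part-of-speech section.

    A deep copy of ``base_data`` is appended to ``page_data`` and its POS
    data is looked up in ``POS_DATA``; a "보조 " ("auxiliary") title prefix
    adds the "auxiliary" tag. The section's sound, linkage, translation
    ("외국어", "foreign language") and headword templates plus its gloss
    lists are then extracted into the copy, which is dropped again if
    nothing new was found.
    """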
    page_data.append(base_data.model_copy(deep=True))
    orig_title = pos_title
    pos_title = pos_title.removeprefix("보조 ").strip()
    if pos_title in POS_DATA:
        page_data[-1].pos_title = orig_title
        pos_data = POS_DATA[pos_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        if (  # coverage: condition was never true
            orig_title.startswith("보조 ")
            and "auxiliary" not in page_data[-1].tags
        ):
            page_data[-1].tags.append("auxiliary")

    has_linkage = False
    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode):
            if node.template_name in SOUND_TEMPLATES:
                extract_sound_template(wxr, page_data[-1], node)
            elif node.template_name in LINKAGE_TEMPLATES:
                has_linkage = extract_linkage_template(
                    wxr, page_data[-1], node, "derived"
                )
            elif node.template_name == "외국어":
                extract_translation_template(
                    wxr,
                    page_data[-1],
                    node,
                    page_data[-1].senses[-1].glosses[-1]
                    if len(page_data[-1].senses) > 0
                    else "",
                )
            elif node.template_name in HEADER_TEMPLATES:  # coverage: always true
                extract_header_template(wxr, page_data[-1], node)
        elif node.kind == NodeKind.LIST:  # coverage: always true
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr,
                        page_data[-1],
                        list_item,
                        Sense(pattern=page_data[-1].pattern),
                    )
                else:
                    extract_unorderd_list_item(wxr, page_data[-1], list_item)

    if not (
        len(page_data[-1].senses) > 0
        or len(page_data[-1].sounds) > len(base_data.sounds)
        or len(page_data[-1].translations) > len(base_data.translations)
        or has_linkage
    ):
        page_data.pop()


def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense,
) -> None:
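    """Extract a gloss list item into a new ``Sense``.

    Handles "... of" form-of templates, "라벨" ("label") usage labels and
    "zh-mw" classifiers inline; nested "#" lists are processed recursively
    as sub-senses inheriting this sense's data, other nested lists as
    unordered metadata lists.
    """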
    gloss_nodes = []
    sense = parent_sense.model_copy(deep=True)
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            gloss_text = clean_node(wxr, sense, gloss_nodes)
            if len(gloss_text) > 0:  # coverage: always true
                sense.glosses.append(gloss_text)
                translate_raw_tags(sense)
                word_entry.senses.append(sense)
                gloss_nodes.clear()
            for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr, word_entry, nested_list_item, sense
                    )
                else:
                    extract_unorderd_list_item(
                        wxr, word_entry, nested_list_item
                    )
            continue
        elif isinstance(node, TemplateNode) and node.template_name.endswith(
            " of"
        ):
            extract_form_of_template(wxr, sense, node)
            gloss_nodes.append(node)
        elif isinstance(node, TemplateNode) and node.template_name == "라벨":
            sense.raw_tags.extend(
                [
                    raw_tag.strip()
                    for raw_tag in clean_node(wxr, sense, node)
                    .strip("()")
                    .split(",")
                ]
            )
        elif isinstance(node, TemplateNode) and node.template_name == "zh-mw":  # coverage: never true
            extract_zh_mw_template(wxr, node, sense)
        else:
            gloss_nodes.append(node)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)


def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
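    """Extract a "*" list item, which may hold one of several things:
    a bold gloss number (obsolete layout), "어원" ("etymology"), a
    "참고"/"참조"/"활용" ("note"/"see also"/"conjugation") note, a linkage
    section, a "문형" ("sentence pattern"), or an example for the last
    sense.
    """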
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            # `* '''1.''' gloss text`, terrible obsolete layout
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = list_item.children[index + 1 :]
                extract_gloss_list_item(wxr, word_entry, new_list_item, Sense())
                break
        elif isinstance(node, str) and "어원:" in node:
            etymology_nodes = []
            etymology_nodes.append(node[node.index(":") + 1 :])
            etymology_nodes.extend(list_item.children[index + 1 :])
            e_text = clean_node(wxr, None, etymology_nodes)
            if len(e_text) > 0:  # coverage: always true
                word_entry.etymology_texts.append(e_text)
            break
        elif (
            isinstance(node, str)
            and re.search(r"(?:참고|참조|활용):", node) is not None
        ):
            note_str = node[node.index(":") + 1 :].strip()
            note_str += clean_node(
                wxr,
                word_entry.senses[-1]
                if len(word_entry.senses) > 0
                else word_entry,
                list_item.children[index + 1 :],
            )
            if len(word_entry.senses) > 0:
                word_entry.senses[-1].note = note_str
            else:
                word_entry.note = note_str
            break
        elif (
            isinstance(node, str)
            and ":" in node
            and node[: node.index(":")].strip() in LINKAGE_SECTIONS
        ):
            extract_linkage_list_item(wxr, word_entry, list_item, "", False)
            break
        elif isinstance(node, str) and "문형:" in node:
            word_entry.pattern = node[node.index(":") + 1 :].strip()
            word_entry.pattern += clean_node(
                wxr, None, list_item.children[index + 1 :]
            )
            break
        else:
            if len(word_entry.senses) > 0:
                extract_example_list_item(
                    wxr, word_entry.senses[-1], list_item, word_entry.lang_code
                )


def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
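    """Save the target word of a "... of" template in ``sense.form_of``."""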
206 if "form-of" not in sense.tags: 206 ↛ 208line 206 didn't jump to line 208 because the condition on line 206 was always true
207 sense.tags.append("form-of")
208 word_arg = 1 if t_node.template_name == "ko-hanja form of" else 2
209 word = clean_node(wxr, None, t_node.template_parameters.get(word_arg, ""))
210 if len(word) > 0: 210 ↛ exitline 210 didn't return from function 'extract_form_of_template' because the condition on line 210 was always true
211 sense.form_of.append(AltForm(word=word))
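

# Headword-line templates dispatched to extract_header_template() below;
# the Korean names parallel the English ones ("동사" verb, "명사" noun,
# "고유명사" proper noun).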
HEADER_TEMPLATES = frozenset(
    [
        "ko-verb",
        "한국어 동사",
        "ko-noun",
        "한국어 명사",
        "ko-proper noun",
        "한국어 고유명사",
    ]
)


def extract_header_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
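    """Dispatch a headword-line template to the verb or noun handler."""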
    if t_node.template_name in ["ko-verb", "한국어 동사"]:  # coverage: always true
        extract_ko_verb_template(wxr, word_entry, t_node)
    elif t_node.template_name in [
        "ko-noun",
        "한국어 명사",
        "ko-proper noun",
        "한국어 고유명사",
    ]:
        extract_ko_noun_template(wxr, word_entry, t_node)


def extract_ko_verb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
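    """Collect conjugated forms from the expanded verb headword line.

    Bold nodes inside the "headword-line" span are forms; the text
    preceding each form becomes its raw tag.
    """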
    # https://ko.wiktionary.org/wiki/틀:한국어_동사
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, word_entry, expanded_node)
    for top_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        raw_tag = ""
        for node in top_span_tag.children:
            if isinstance(node, str):
                if "(" in node:
                    raw_tag = node[node.rindex("(") + 1 :].strip(", ")
                else:
                    raw_tag = node.strip(", ")
            elif isinstance(node, HTMLNode) and node.tag == "b":
                form = Form(form=clean_node(wxr, None, node))
                if raw_tag != "":  # coverage: always true
                    form.raw_tags.append(raw_tag)
                if form.form != "":  # coverage: always true
                    translate_raw_tags(form)
                    word_entry.forms.append(form)


def extract_ko_noun_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
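    """Save the "한자" ("hanja") template argument as a hanja form."""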
    # https://ko.wiktionary.org/wiki/틀:한국어_명사
    # https://ko.wiktionary.org/wiki/틀:한국어_고유명사
    hanja = clean_node(wxr, None, t_node.template_parameters.get("한자", ""))
    if hanja != "":
        word_entry.forms.append(Form(form=hanja, tags=["hanja"]))


def extract_grammar_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
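    """Save the grammar-note list items in ``word_entry.note``.

    Each list item overwrites the previous one, so only the last item's
    text is kept.
    """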
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        word_entry.note = clean_node(wxr, None, list_item.children)


def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
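    """Extract Chinese classifiers (measure words) from an expanded
    "zh-mw" template.

    "Hant"/"Hans" spans become traditional/simplified classifiers, and
    "title" tooltip text is attached to them as raw tags.
    """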
    # Chinese inline classifier template
    # copied from zh edition code
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)