# NOTE: this file was recovered from a coverage.py HTML report
# (coverage.py v7.10.6, 2025-09-12; 89% of 139 statements covered for
# src/wiktextract/extractor/ko/pos.py). Report chrome and inline branch
# annotations have been stripped from the code below.
import re

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .linkage import (
    LINKAGE_TEMPLATES,
    extract_linkage_list_item,
    extract_linkage_template,
)
from .models import AltForm, Form, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .tags import translate_raw_tags
from .translation import extract_translation_template
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Extract one part-of-speech section into a new ``WordEntry``.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled in
    from the section's templates and lists. If nothing useful was extracted
    (no senses, and no sounds/translations/linkages beyond what
    ``base_data`` already had), the entry is popped again.
    """
    page_data.append(base_data.model_copy(deep=True))
    orig_title = pos_title
    # "보조 " ("auxiliary ") prefixed titles map to the base POS plus an
    # "auxiliary" tag.
    pos_title = pos_title.removeprefix("보조 ").strip()
    if pos_title in POS_DATA:
        page_data[-1].pos_title = orig_title
        pos_data = POS_DATA[pos_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        if (
            orig_title.startswith("보조 ")
            and "auxiliary" not in page_data[-1].tags
        ):
            page_data[-1].tags.append("auxiliary")

    has_linkage = False
    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode):
            if node.template_name in SOUND_TEMPLATES:
                extract_sound_template(wxr, page_data[-1], node)
            elif node.template_name in LINKAGE_TEMPLATES:
                has_linkage = extract_linkage_template(wxr, page_data[-1], node)
            elif node.template_name == "외국어":
                # Translation template; attach to the last gloss if any
                # sense has been extracted so far.
                extract_translation_template(
                    wxr,
                    page_data[-1],
                    node,
                    page_data[-1].senses[-1].glosses[-1]
                    if len(page_data[-1].senses) > 0
                    else "",
                )
            elif node.template_name in HEADER_TEMPLATES:
                extract_header_template(wxr, page_data[-1], node)
        elif node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                # "#"-style lists hold glosses; other lists hold
                # etymology/notes/linkages/examples.
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr,
                        page_data[-1],
                        list_item,
                        Sense(pattern=page_data[-1].pattern),
                    )
                else:
                    extract_unorderd_list_item(wxr, page_data[-1], list_item)

    # Drop the entry if this section contributed nothing new.
    if not (
        len(page_data[-1].senses) > 0
        or len(page_data[-1].sounds) > len(base_data.sounds)
        or len(page_data[-1].translations) > len(base_data.translations)
        or has_linkage
    ):
        page_data.pop()
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense,
) -> None:
    """Extract a gloss list item into a ``Sense`` on ``word_entry``.

    The new sense starts as a deep copy of ``parent_sense`` so nested
    ("##") glosses inherit tags and earlier gloss text. Nested lists are
    dispatched recursively; "form of"-style and "라벨" (label) templates
    update the sense before the remaining nodes are cleaned into gloss
    text.
    """
    gloss_nodes = []
    sense = parent_sense.model_copy(deep=True)
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Flush gloss text collected so far before descending, so the
            # child senses inherit it via the recursive call.
            gloss_text = clean_node(wxr, sense, gloss_nodes)
            if len(gloss_text) > 0:
                sense.glosses.append(gloss_text)
                translate_raw_tags(sense)
                word_entry.senses.append(sense)
                gloss_nodes.clear()
            for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr, word_entry, nested_list_item, sense
                    )
                else:
                    extract_unorderd_list_item(
                        wxr, word_entry, nested_list_item
                    )
            continue
        elif isinstance(node, TemplateNode) and node.template_name.endswith(
            " of"
        ):
            # Templates like "ko-hanja form of"; the template text is also
            # kept as part of the gloss.
            extract_form_of_template(wxr, sense, node)
            gloss_nodes.append(node)
        elif isinstance(node, TemplateNode) and node.template_name == "라벨":
            # "라벨" (label) template expands to "(tag1, tag2, ...)".
            sense.raw_tags.extend(
                [
                    raw_tag.strip()
                    for raw_tag in clean_node(wxr, sense, node)
                    .strip("()")
                    .split(",")
                ]
            )
        else:
            gloss_nodes.append(node)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)
def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Dispatch a "*"-style (unordered) list item by its prefix.

    Recognized prefixes: a bold sense number (obsolete gloss layout),
    "어원:" (etymology), "참고:"/"참조:"/"활용:" (notes), a linkage
    section name, and "문형:" (sentence pattern). Anything else is
    treated as example text for the last sense.
    """
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            # `* '''1.''' gloss text`, terrible obsolete layout
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
                # Re-wrap the remainder as a list item and parse it as a
                # gloss.
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = list_item.children[index + 1 :]
                extract_gloss_list_item(wxr, word_entry, new_list_item, Sense())
                break
        elif isinstance(node, str) and "어원:" in node:
            # Etymology line.
            etymology_nodes = []
            etymology_nodes.append(node[node.index(":") + 1 :])
            etymology_nodes.extend(list_item.children[index + 1 :])
            e_text = clean_node(wxr, None, etymology_nodes)
            if len(e_text) > 0:
                word_entry.etymology_texts.append(e_text)
            break
        elif (
            isinstance(node, str)
            and re.search(r"(?:참고|참조|활용):", node) is not None
        ):
            # Note line; attach to the last sense if one exists, otherwise
            # to the word entry itself.
            note_str = node[node.index(":") + 1 :].strip()
            note_str += clean_node(
                wxr,
                word_entry.senses[-1]
                if len(word_entry.senses) > 0
                else word_entry,
                list_item.children[index + 1 :],
            )
            if len(word_entry.senses) > 0:
                word_entry.senses[-1].note = note_str
            else:
                word_entry.note = note_str
            break
        elif (
            isinstance(node, str)
            and ":" in node
            and node[: node.index(":")].strip() in LINKAGE_SECTIONS
        ):
            extract_linkage_list_item(wxr, word_entry, list_item, "", False)
            break
        elif isinstance(node, str) and "문형:" in node:
            # Sentence-pattern line.
            word_entry.pattern = node[node.index(":") + 1 :].strip()
            word_entry.pattern += clean_node(
                wxr, None, list_item.children[index + 1 :]
            )
            break
        else:
            # NOTE(review): no break here — every non-matching child node
            # re-submits the whole list item as example text; confirm this
            # repetition is intended.
            if len(word_entry.senses) > 0:
                extract_example_list_item(
                    wxr, word_entry.senses[-1], list_item, word_entry.lang_code
                )
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Tag ``sense`` as "form-of" and record the referenced word.

    "ko-hanja form of" keeps the target word in positional argument 1;
    the other "... of" templates keep it in argument 2.
    """
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
    word_arg = 1 if t_node.template_name == "ko-hanja form of" else 2
    word = clean_node(wxr, None, t_node.template_parameters.get(word_arg, ""))
    if len(word) > 0:
        sense.form_of.append(AltForm(word=word))
# Headword-line templates handled by extract_header_template(); each
# English name is paired with its Korean Wiktionary alias.
HEADER_TEMPLATES = frozenset(
    [
        "ko-verb",
        "한국어 동사",  # Korean verb
        "ko-noun",
        "한국어 명사",  # Korean noun
        "ko-proper noun",
        "한국어 고유명사",  # Korean proper noun
    ]
)
def extract_header_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Dispatch a headword-line template to its POS-specific extractor."""
    if t_node.template_name in ["ko-verb", "한국어 동사"]:
        extract_ko_verb_template(wxr, word_entry, t_node)
    elif t_node.template_name in [
        "ko-noun",
        "한국어 명사",
        "ko-proper noun",
        "한국어 고유명사",
    ]:
        extract_ko_noun_template(wxr, word_entry, t_node)
def extract_ko_verb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract conjugated forms from the Korean verb headword template.

    Expands the template and walks the "headword-line" span: plain text
    in parentheses (or between commas) supplies the raw tag for the bold
    form that follows it.
    """
    # https://ko.wiktionary.org/wiki/틀:한국어_동사
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Collect any categories the expansion adds onto the word entry.
    clean_node(wxr, word_entry, expanded_node)
    for top_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        raw_tag = ""
        for node in top_span_tag.children:
            if isinstance(node, str):
                if "(" in node:
                    raw_tag = node[node.rindex("(") + 1 :].strip(", ")
                else:
                    raw_tag = node.strip(", ")
            elif isinstance(node, HTMLNode) and node.tag == "b":
                form = Form(form=clean_node(wxr, None, node))
                if raw_tag != "":
                    form.raw_tags.append(raw_tag)
                if form.form != "":
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
def extract_ko_noun_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the hanja form from the Korean noun headword templates.

    Only the "한자" (hanja) template parameter is used; it becomes a
    form tagged "hanja".
    """
    # https://ko.wiktionary.org/wiki/틀:한국어_명사
    # https://ko.wiktionary.org/wiki/틀:한국어_고유명사
    hanja = clean_node(wxr, None, t_node.template_parameters.get("한자", ""))
    if hanja != "":
        word_entry.forms.append(Form(form=hanja, tags=["hanja"]))
def extract_grammar_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Store grammar-note list items on ``word_entry.note``.

    NOTE(review): each iteration overwrites ``note``, so only the last
    list item in the section survives — confirm sections never contain
    more than one note, or join them instead.
    """
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        word_entry.note = clean_node(wxr, None, list_item.children)