# src/wiktextract/extractor/ko/pos.py
# (coverage.py v7.6.10 report: 89% of 138 statements, generated 2024-12-27 08:07 +0000)
import re

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .linkage import (
    LINKAGE_TEMPLATES,
    extract_linkage_list_item,
    extract_linkage_template,
)
from .models import AltForm, Form, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .tags import translate_raw_tags
from .translation import extract_translation_template
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Extract one part-of-speech section into a new ``WordEntry``.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled
    from the section's templates and lists.  If the section contributed
    nothing beyond the POS title and tags, the copy is removed again.
    """
    page_data.append(base_data.model_copy(deep=True))
    orig_title = pos_title
    # "보조 " ("auxiliary") is a modifier prefix, not part of the POS name.
    pos_title = pos_title.removeprefix("보조 ").strip()
    if pos_title in POS_DATA:
        page_data[-1].pos_title = orig_title
        pos_data = POS_DATA[pos_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        if (
            orig_title.startswith("보조 ")
            and "auxiliary" not in page_data[-1].tags
        ):
            page_data[-1].tags.append("auxiliary")

    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode):
            if node.template_name in SOUND_TEMPLATES:
                extract_sound_template(wxr, page_data[-1], node)
            elif node.template_name in LINKAGE_TEMPLATES:
                extract_linkage_template(wxr, page_data[-1], node)
            elif node.template_name == "외국어":
                # Pass the most recent gloss as sense context for the
                # translation template.  Guard both lists: a sense could
                # in principle exist with an empty ``glosses`` list, and
                # indexing it would raise IndexError.
                last_sense = (
                    page_data[-1].senses[-1]
                    if len(page_data[-1].senses) > 0
                    else None
                )
                extract_translation_template(
                    wxr,
                    page_data[-1],
                    node,
                    last_sense.glosses[-1]
                    if last_sense is not None and len(last_sense.glosses) > 0
                    else "",
                )
            elif node.template_name in HEADER_TEMPLATES:
                extract_header_template(wxr, page_data[-1], node)
        elif node.kind == NodeKind.LIST:
            # "#…#" lists hold glosses; other lists hold etymology,
            # notes, linkage, patterns or examples.  The check is
            # loop-invariant, so hoist it out of the item loop.
            is_gloss_list = node.sarg.startswith("#") and node.sarg.endswith(
                "#"
            )
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                if is_gloss_list:
                    extract_gloss_list_item(
                        wxr,
                        page_data[-1],
                        list_item,
                        Sense(pattern=page_data[-1].pattern),
                    )
                else:
                    extract_unorderd_list_item(wxr, page_data[-1], list_item)

    # Drop the entry again if nothing beyond title/tags was extracted.
    if len(
        page_data[-1].model_dump(
            exclude_defaults=True, exclude={"pos_title", "tags"}
        )
    ) == len(base_data.model_dump(exclude_defaults=True)):
        page_data.pop()
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense,
) -> None:
    """Parse one gloss list item, recursing into nested sub-lists.

    The sense created here starts as a deep copy of ``parent_sense`` so
    that labels and tags of outer glosses are inherited.
    """
    sense = parent_sense.model_copy(deep=True)
    pending: list = []

    def flush_gloss() -> None:
        # Turn the collected nodes into gloss text; register the sense
        # only when the cleaned text is non-empty (in which case the
        # buffer is also reset).
        text = clean_node(wxr, sense, pending)
        if len(text) > 0:
            sense.glosses.append(text)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)
            pending.clear()

    for child in list_item.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Finish the gloss gathered so far before descending.
            flush_gloss()
            nested_is_gloss = child.sarg.startswith(
                "#"
            ) and child.sarg.endswith("#")
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                if nested_is_gloss:
                    extract_gloss_list_item(wxr, word_entry, sub_item, sense)
                else:
                    extract_unorderd_list_item(wxr, word_entry, sub_item)
        elif isinstance(child, TemplateNode) and child.template_name.endswith(
            " of"
        ):
            # Form-of templates both annotate the sense and stay part of
            # the gloss text.
            extract_form_of_template(wxr, sense, child)
            pending.append(child)
        elif isinstance(child, TemplateNode) and child.template_name == "라벨":
            # "라벨" ("label") renders as "(tag1, tag2, …)".
            label_text = clean_node(wxr, sense, child).strip("()")
            sense.raw_tags.extend(
                raw_tag.strip() for raw_tag in label_text.split(",")
            )
        else:
            pending.append(child)

    trailing_text = clean_node(wxr, sense, pending)
    if len(trailing_text) > 0:
        sense.glosses.append(trailing_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)
def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Process a non-"#" list item of a POS section.

    The leading content decides the meaning of the line: an obsolete
    bold-numbered gloss, an etymology line ("어원:"), a usage note
    ("참고:"/"참조:"/"활용:"), a linkage line, a sentence pattern
    ("문형:"), or free example text attached to the newest sense.
    """
    bold_not_seen = True
    for index, child in enumerate(list_item.children):
        if (
            bold_not_seen
            and isinstance(child, WikiNode)
            and child.kind == NodeKind.BOLD
        ):
            # `* '''1.''' gloss text`, terrible obsolete layout
            bold_not_seen = False
            bold_text = clean_node(wxr, None, child)
            if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
                # Re-wrap everything after the number as a fresh list
                # item and parse it as a gloss.
                wrapper = WikiNode(NodeKind.LIST_ITEM, 0)
                wrapper.children = list_item.children[index + 1 :]
                extract_gloss_list_item(wxr, word_entry, wrapper, Sense())
                break
        elif isinstance(child, str) and "어원:" in child:
            # Text after the colon plus all following siblings forms the
            # etymology.
            etymology_text = clean_node(
                wxr,
                None,
                [child[child.index(":") + 1 :]]
                + list_item.children[index + 1 :],
            )
            if len(etymology_text) > 0:
                word_entry.etymology_texts.append(etymology_text)
            break
        elif (
            isinstance(child, str)
            and re.search(r"(?:참고|참조|활용):", child) is not None
        ):
            note_text = child[child.index(":") + 1 :].strip()
            note_text += clean_node(
                wxr,
                word_entry.senses[-1]
                if len(word_entry.senses) > 0
                else word_entry,
                list_item.children[index + 1 :],
            )
            # Attach the note to the newest sense when one exists,
            # otherwise to the whole entry.
            if len(word_entry.senses) > 0:
                word_entry.senses[-1].note = note_text
            else:
                word_entry.note = note_text
            break
        elif (
            isinstance(child, str)
            and ":" in child
            and child[: child.index(":")].strip() in LINKAGE_SECTIONS
        ):
            extract_linkage_list_item(wxr, word_entry, list_item, "", False)
            break
        elif isinstance(child, str) and "문형:" in child:
            pattern_text = child[child.index(":") + 1 :].strip()
            pattern_text += clean_node(
                wxr, None, list_item.children[index + 1 :]
            )
            word_entry.pattern = pattern_text
            break
        else:
            # NOTE(review): no ``break`` here, so this branch runs once
            # per remaining child and always passes the whole list item
            # to the example extractor — confirm repeated calls are
            # intended.
            if len(word_entry.senses) > 0:
                extract_example_list_item(
                    wxr, word_entry.senses[-1], list_item, word_entry.lang_code
                )
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the target word of a "... of" (form-of) template."""
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
    # "ko-hanja form of" keeps the target word in its first argument;
    # the other form-of templates use the second.
    arg_index = 1 if t_node.template_name == "ko-hanja form of" else 2
    target = clean_node(
        wxr, None, t_node.template_parameters.get(arg_index, "")
    )
    if target != "":
        sense.form_of.append(AltForm(word=target))
# Headword templates that carry inflection/form data for a POS section;
# English and Korean template names are paired.
HEADER_TEMPLATES = frozenset(
    {
        "ko-verb",
        "한국어 동사",  # Korean verb
        "ko-noun",
        "한국어 명사",  # Korean noun
        "ko-proper noun",
        "한국어 고유명사",  # Korean proper noun
    }
)
def extract_header_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Dispatch a headword template to the matching extractor."""
    name = t_node.template_name
    if name in ("ko-verb", "한국어 동사"):
        extract_ko_verb_template(wxr, word_entry, t_node)
    elif name in (
        "ko-noun",
        "한국어 명사",
        "ko-proper noun",
        "한국어 고유명사",
    ):
        extract_ko_noun_template(wxr, word_entry, t_node)
def extract_ko_verb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract inflected forms from an expanded verb headword template.

    The template is expanded to HTML, categories are collected via
    ``clean_node``, and each bold form inside the "headword-line" span is
    stored together with the label text preceding it.
    """
    # https://ko.wiktionary.org/wiki/틀:한국어_동사
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Side effect only: collects category links into word_entry.
    clean_node(wxr, word_entry, expanded_node)
    for top_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        # ``raw_tag`` carries the most recent plain-text label (e.g. the
        # text inside/after a "(") forward to the next bold form node.
        raw_tag = ""
        for node in top_span_tag.children:
            if isinstance(node, str):
                if "(" in node:
                    # Take the text after the last "(" as the label.
                    raw_tag = node[node.rindex("(") + 1 :].strip(", ")
                else:
                    raw_tag = node.strip(", ")
            elif isinstance(node, HTMLNode) and node.tag == "b":
                # Bold nodes hold the inflected form itself.
                form = Form(form=clean_node(wxr, None, node))
                if raw_tag != "":
                    form.raw_tags.append(raw_tag)
                if form.form != "":
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
def extract_ko_noun_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Store the hanja spelling given in a noun headword template."""
    # https://ko.wiktionary.org/wiki/틀:한국어_명사
    # https://ko.wiktionary.org/wiki/틀:한국어_고유명사
    hanja_arg = t_node.template_parameters.get("한자", "")
    hanja_form = clean_node(wxr, None, hanja_arg)
    if len(hanja_form) > 0:
        word_entry.forms.append(Form(form=hanja_form, tags=["hanja"]))
def extract_grammar_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Save a grammar-note section's list items as the entry's note.

    NOTE(review): each list item overwrites ``word_entry.note``, so only
    the last item survives — confirm last-wins is intended.
    """
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        word_entry.note = clean_node(wxr, None, item.children)