Coverage for src/wiktextract/extractor/ko/page.py: 34%
192 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-27 07:52 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 HTMLNode,
8 LevelNode,
9 NodeKind,
10 TemplateNode,
11 WikiNode,
12)
14from ...page import clean_node
15from ...wxr_context import WiktextractContext
16from .etymology import extract_etymology_section
17from .linkage import extract_linkage_section
18from .models import Form, Linkage, Sense, WordEntry
19from .pos import extract_grammar_note_section, extract_pos_section
20from .section_titles import LINKAGE_SECTIONS, POS_DATA
21from .sound import (
22 SOUND_TEMPLATES,
23 extract_sound_section,
24 extract_sound_template,
25)
26from .tags import translate_raw_tags
27from .translation import extract_translation_section
def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect categories attached to a section node.

    Category wiki links and ``{{C}}``/``{{topics}}`` templates are cleaned
    into the most recent word entry, or into ``base_data`` when no entry
    has been created yet.
    """
    target = page_data[-1] if len(page_data) > 0 else base_data
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, target, link_node)
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ("C", "topics"):
            clean_node(wxr, target, t_node)
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one section of a language entry to the matching extractor.

    The cleaned section title decides which extractor runs; unknown
    titles are reported through ``wxr.wtp.debug``.  Child sections are
    handled recursively and section-level categories are collected last.
    """
    section_title = clean_node(wxr, None, level_node.largs)
    # Drop a trailing counter (e.g. "명사 2") and surrounding brackets,
    # then cut any parenthesized suffix.
    section_title = re.sub(r"\s*\d+$", "", section_title).strip("() ")
    if "(" in section_title:
        section_title = section_title[: section_title.index("(")]

    # Default target entry for sections that attach to an existing entry.
    fallback_entry = page_data[-1] if len(page_data) > 0 else base_data

    if section_title.removeprefix("보조 ").strip() in POS_DATA:
        entries_before = len(page_data)
        extract_pos_section(
            wxr, page_data, base_data, level_node, section_title
        )
        if (
            len(page_data) == entries_before
            and section_title in LINKAGE_SECTIONS
            and len(page_data) > 0
        ):
            # No new POS entry appeared; treat the title as a linkage
            # header instead.
            extract_linkage_section(
                wxr,
                page_data[-1],
                level_node,
                LINKAGE_SECTIONS[section_title],
            )
    elif section_title in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, fallback_entry, level_node, LINKAGE_SECTIONS[section_title]
        )
    elif section_title == "번역":  # translations
        extract_translation_section(wxr, fallback_entry, level_node)
    elif section_title == "발음":  # pronunciation
        extract_sound_section(wxr, fallback_entry, level_node)
    elif section_title == "어원":  # etymology
        # Reuse the last entry only while it has no etymology yet;
        # otherwise store on base_data for later entries.
        if len(page_data) > 0 and len(page_data[-1].etymology_texts) == 0:
            etymology_target = page_data[-1]
        else:
            etymology_target = base_data
        extract_etymology_section(wxr, etymology_target, level_node)
    elif section_title == "어법 주의 사항":  # usage notes
        extract_grammar_note_section(wxr, fallback_entry, level_node)
    elif section_title in ("다른 표기", "표기"):  # alternative spellings
        extract_alt_form_section(wxr, base_data, level_node)
    elif section_title in (
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
        "외부 링크",
    ):
        pass  # deliberately ignored sections
    else:
        wxr.wtp.debug(f"unknown title: {section_title}", sortid="ko/page/63")

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    extract_section_categories(wxr, page_data, base_data, level_node)
def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process one level-2 (language) section of a page.

    Creates a fresh ``base_data`` entry template for the language, handles
    language-level templates (sounds, zh/ja redirects, ``{{zh-forms}}``),
    then parses child sections.  When no POS section produced an entry,
    the whole section is extracted as a single POS-less entry.
    """
    entries_before = len(page_data)
    lang_name = clean_node(wxr, None, level2_node.largs) or "unknown"
    lang_code = name_to_code(lang_name, "ko") or "unknown"
    capture_codes = wxr.config.capture_language_codes
    if capture_codes is not None and lang_code not in capture_codes:
        return  # language filtered out by configuration
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    for t_node in level2_node.find_child(NodeKind.TEMPLATE):
        t_name = t_node.template_name
        if t_name in SOUND_TEMPLATES:
            extract_sound_template(wxr, base_data, t_node)
        elif t_name == "zh-see":
            base_data.redirects.append(
                clean_node(wxr, None, t_node.template_parameters.get(1, ""))
            )
            clean_node(wxr, base_data, t_node)  # pick up categories
        elif t_name in ("ja-see", "ja-see-kango"):
            extract_ja_see_template(wxr, base_data, t_node)
        elif t_name == "zh-forms":
            extract_zh_forms(wxr, base_data, t_node)
    if len(base_data.redirects) > 0:
        # redirect-only page: the base entry itself becomes the result
        page_data.append(base_data)
    for child_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    # no POS section found anywhere in this language section
    if len(page_data) == entries_before:
        extract_pos_section(wxr, page_data, base_data, level2_node, "")
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a Korean Wiktionary page into word-entry dictionaries.

    Page layout:
    https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    """
    # Skip appendix pages and a known broken title.
    if page_title.startswith(("Appendix:", "T195546/NS111")):
        return []
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_section in root.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, lang_section)

    # Entries without any sense get an explicit "no-gloss" placeholder.
    for entry in page_data:
        if len(entry.senses) == 0:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_alt_form_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Find {{alt}}/{{alter}} templates anywhere in the section and extract them."""
    for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if template.template_name in ("alt", "alter"):
            extract_alt_template(wxr, base_data, template)
def extract_alt_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse an expanded {{alt}}/{{alter}} template into ``base_data.forms``.

    Spans in the template's language become new forms; "-Latn" spans add a
    romanization to the latest form; "label-content" spans add a raw tag to
    every form collected so far.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    collected: list[Form] = []
    for span_tag in expanded.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_classes = span_tag.attrs.get("class", "").split()
        if span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                collected.append(Form(form=word))
        elif span_lang.endswith("-Latn") and len(collected) > 0:
            # romanization belongs to the most recently seen form
            collected[-1].roman = clean_node(wxr, None, span_tag)
        elif "label-content" in span_classes and len(collected) > 0:
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                # the label applies to all forms gathered so far
                for form in collected:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
    base_data.forms.extend(collected)
def extract_ja_see_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Save positional arguments of {{ja-see}}/{{ja-see-kango}} as redirects."""
    for arg_name, arg_value in t_node.template_parameters.items():
        if isinstance(arg_name, int):
            base_data.redirects.append(clean_node(wxr, None, arg_value))
    clean_node(wxr, base_data, t_node)  # collect categories from the template
def extract_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse an expanded {{zh-forms}} table.

    Stores the ``lit=`` argument as the literal meaning, then walks the
    rendered table row by row.  Header cells set the row context; data
    cells are skipped when the header cell itself carried the forms
    (i.e. contained a span).
    """
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            header_text = ""
            header_tags: list[str] = []
            header_had_span = False
            for cell_node in row_node.find_child(cell_kinds):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text, header_tags, header_had_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell_node)
                    )
                elif not header_had_span:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell_node, header_text, header_tags
                    )
def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Process a {{zh-forms}} row-header cell.

    Returns ``(row_header, row_header_tags, header_has_span)`` where
    ``row_header`` is the header text before the first HTML span,
    ``row_header_tags`` are the header's tags split on "/" and " 및 "
    ("and"), and ``header_has_span`` tells the caller the forms were
    already taken from this cell, so data cells should be skipped.
    """
    row_header = ""
    row_header_tags: list[str] = []
    header_has_span = False
    # Index of the first span child; defaults past the end so that a
    # span-less header uses all children as header text.
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
        header_has_span = True
    # Header text is everything before the first span.
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    for raw_tag in re.split(r"/| 및 ", row_header):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                # <sup><span title="..."> holds an extra annotation for
                # the form; keep the last title seen.
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            # A span may pack several variants separated by "/".
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span
def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Process a {{zh-forms}} data cell under the given row header.

    Forms found in the cell inherit the row header's tags; "어구전철"
    (anagram) rows are stored as ``anagrams`` linkages instead of forms.
    Recurses into ``white-space:nowrap`` wrapper spans.
    """
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            # wrapper span: recurse with the same row context
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            # small-print text is a label applying to the forms
            # collected so far in this cell
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                # NOTE(review): this compares against English "anagram"
                # while the storage branch below compares against
                # "어구전철" — confirm whether both should use the
                # Korean header text.
                if row_header != "anagram":
                    form.raw_tags = row_header_tags
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "어구전철":  # "anagram" rows
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)