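"""Parse pages from the Korean Wiktionary (ko.wiktionary.org).

A page is split into level-2 language sections; each subsection is then
dispatched by title to the matching extractor: part of speech, "번역"
(translations), "발음" (pronunciation), "어원" (etymology), linkage
sections, and so on.
"""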
import re
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section, extract_ja_kanjitab_template
from .linkage import extract_linkage_section
from .models import Form, Linkage, Sense, WordEntry
from .pos import extract_grammar_note_section, extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import (
    SOUND_TEMPLATES,
    extract_sound_section,
    extract_sound_template,
)
from .tags import translate_raw_tags
from .translation import extract_translation_section


def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
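    """Collect category links and `C`/`topics` templates in this section.

    Categories are attached to the newest entry in `page_data`, or to
    `base_data` when no entry has been created yet.
    """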
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, link_node
        )
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ["C", "topics"]:
            clean_node(
                wxr, page_data[-1] if len(page_data) > 0 else base_data, t_node
            )


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
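    """Dispatch a section to the matching extractor based on its title.

    Trailing digits and surrounding parentheses are stripped from the
    title before lookup. Recognized titles include part-of-speech names
    (`POS_DATA`), linkage sections (`LINKAGE_SECTIONS`), "번역"
    (translations), "발음" (pronunciation), "어원" (etymology), "어법 주의
    사항" (grammar notes), and "다른 표기"/"표기" (alternative forms).
    Child sections are parsed recursively.
    """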
    title_text = clean_node(wxr, None, level_node.largs)
    title_text = re.sub(r"\s*\d+$", "", title_text).strip("() ")
    if "(" in title_text:
        title_text = title_text[: title_text.index("(")]
    if title_text.removeprefix("보조 ").strip() in POS_DATA:
        orig_page_data_len = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if (
            len(page_data) == orig_page_data_len
            and title_text in LINKAGE_SECTIONS
            and len(page_data) > 0
        ):  # POS extraction added nothing; try it as a linkage section
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title_text]
            )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "번역":
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "발음":
        extract_sound_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "어원":
        extract_etymology_section(
            wxr,
            page_data[-1]
            if len(page_data) > 0 and len(page_data[-1].etymology_texts) == 0
            else base_data,
            level_node,
        )
    elif title_text == "어법 주의 사항":
        extract_grammar_note_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
        )
    elif title_text in ["다른 표기", "표기"]:
        extract_alt_form_section(wxr, base_data, level_node)
    elif title_text in [
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
        "외부 링크",
    ]:
        pass  # ignore these sections
    else:
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="ko/page/63")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_section_categories(wxr, page_data, base_data, level_node)


def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
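    """Parse a level-2 language section.

    Builds a `base_data` entry carrying the language name and code,
    extracts section-level templates (sound templates, `zh-see`,
    `ja-see`/`ja-see-kango`, `zh-forms`, kanjitab), then parses each
    child section.
    """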
    pre_data_len = len(page_data)
    lang_name = clean_node(wxr, None, level2_node.largs)
    if lang_name == "":
        lang_name = "unknown"
    lang_code = name_to_code(lang_name, "ko")
    if lang_code == "":
        lang_code = "unknown"
    if (
        wxr.config.capture_language_codes is not None
        and lang_code not in wxr.config.capture_language_codes
    ):
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    for t_node in level2_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in SOUND_TEMPLATES:
            extract_sound_template(wxr, base_data, t_node)
        elif t_node.template_name == "zh-see":
            base_data.redirects.append(
                clean_node(wxr, None, t_node.template_parameters.get(1, ""))
            )
            clean_node(wxr, base_data, t_node)
        elif t_node.template_name in ["ja-see", "ja-see-kango"]:
            extract_ja_see_template(wxr, base_data, t_node)
        elif t_node.template_name == "zh-forms":
            extract_zh_forms(wxr, base_data, t_node)
        elif (
            t_node.template_name.endswith("-kanjitab")
            or t_node.template_name == "ja-kt"
        ):
            extract_ja_kanjitab_template(wxr, t_node, base_data)
    if len(base_data.redirects) > 0:
        page_data.append(base_data)
    for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    # no POS section produced an entry; parse the whole section as one
    if len(page_data) == pre_data_len:
        extract_pos_section(wxr, page_data, base_data, level2_node, "")


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
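    """Parse a Korean Wiktionary page into a list of entry dicts.

    Each level-2 node is treated as a language section. Entries that end
    up without senses receive a "no-gloss" placeholder sense.
    """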
    # page layout
    # https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    # https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    if page_title.startswith(("Appendix:", "T195546/NS111")):
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, level2_node)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]


def extract_alt_form_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
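    """Extract "다른 표기"/"표기" (alternative form) sections.

    Only `alt`/`alter` templates found in the section are processed.
    """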
    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if t_node.template_name in ["alt", "alter"]:
            extract_alt_template(wxr, base_data, t_node)


def extract_alt_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
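    """Extract word forms from an expanded `alt`/`alter` template.

    Spans whose `lang` attribute matches the template's language code are
    forms; "*-Latn" spans supply the romanization of the preceding form;
    "label-content" spans carry raw tags applied to every collected form.
    """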
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    forms = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_class = span_tag.attrs.get("class", "").split()
        if span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                forms.append(Form(form=word))
        elif span_lang.endswith("-Latn") and len(forms) > 0:
            forms[-1].roman = clean_node(wxr, None, span_tag)
        elif "label-content" in span_class and len(forms) > 0:
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
    base_data.forms.extend(forms)


def extract_ja_see_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
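    """Record redirect targets from a `ja-see`/`ja-see-kango` template."""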
    for key, value in t_node.template_parameters.items():
        if isinstance(key, int):
            base_data.redirects.append(clean_node(wxr, None, value))
    clean_node(wxr, base_data, t_node)


def extract_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
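    """Extract Chinese form data from a `zh-forms` template table.

    The template's `lit` parameter becomes the literal meaning. Each
    table row has a header cell naming the form type; when the header
    cell itself contains word spans, the forms are taken from the header
    and the data cells of that row are skipped.
    """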
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_header_tags = []
            header_has_span = False
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header, row_header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not header_has_span:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, row_header, row_header_tags
                    )


def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
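    """Parse a `zh-forms` row header cell.

    Returns the header text, the raw tags split from it on "/" and " 및 "
    ("and"), and whether the header contains HTML spans. "zh-Hant" and
    "zh-Hans" spans found in the header are added to `base_data.forms`
    directly, with any superscript note title as an extra raw tag.
    """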
    row_header = ""
    row_header_tags = []
    header_has_span = False
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
        header_has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    for raw_tag in re.split(r"/| 및 ", row_header):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span


def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
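    """Parse a `zh-forms` data cell into forms or anagrams.

    Nested "white-space:nowrap" spans are recursed into, small
    ("font-size:80%") spans become raw tags on the forms collected so
    far, and "zh-Hant"/"zh-Hans"/"zh" spans become tagged forms. A row
    headed "어구전철" (anagram) is stored in `base_data.anagrams` instead
    of `base_data.forms`.
    """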
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                if row_header != "anagram":
                    form.raw_tags = row_header_tags
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "어구전철":
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)