Coverage for src/wiktextract/extractor/ko/page.py: 62%
104 statements
« prev ^ index » next — coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 LevelNode,
8 NodeKind,
9 TemplateNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .etymology import extract_etymology_section
15from .linkage import extract_linkage_section
16from .models import Form, Sense, WordEntry
17from .pos import extract_grammar_note_section, extract_pos_section
18from .section_titles import LINKAGE_SECTIONS, POS_DATA
19from .sound import (
20 SOUND_TEMPLATES,
21 extract_sound_section,
22 extract_sound_template,
23)
24from .tags import translate_raw_tags
25from .translation import extract_translation_section
def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect category links and category templates found in a section.

    Categories are attached to the most recently created word entry, or to
    ``base_data`` when no entry has been created yet.
    """
    # clean_node() records categories on the entry passed as second argument.
    target = page_data[-1] if len(page_data) > 0 else base_data
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, target, link_node)
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ["C", "topics"]:
            clean_node(wxr, target, t_node)
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one section node to the matching extractor based on its title.

    Unrecognized titles are reported through ``wxr.wtp.debug``; child level
    nodes are processed recursively, and section categories are collected last.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    # Drop a trailing counter (e.g. "명사 1") and surrounding parentheses,
    # then cut anything after an opening parenthesis.
    title_text = re.sub(r"\s*\d+$", "", title_text).strip("() ")
    if "(" in title_text:
        title_text = title_text[: title_text.index("(")]

    # "보조 " ("auxiliary") prefixed titles also count as POS sections.
    if title_text.removeprefix("보조 ").strip() in POS_DATA:
        old_len = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        no_new_entry = len(page_data) == old_len
        if no_new_entry and title_text in LINKAGE_SECTIONS and len(page_data) > 0:
            # POS extraction produced nothing; retry the section as linkage.
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title_text]
            )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "번역":  # translations
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "발음":  # pronunciation
        extract_sound_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "어원":  # etymology
        # Reuse the last entry only when it has no etymology yet; otherwise
        # store on base_data so later entries inherit it.
        use_last = (
            len(page_data) > 0 and len(page_data[-1].etymology_texts) == 0
        )
        extract_etymology_section(
            wxr,
            page_data[-1] if use_last else base_data,
            level_node,
        )
    elif title_text == "어법 주의 사항":  # usage notes
        extract_grammar_note_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
        )
    elif title_text in ["다른 표기", "표기"]:  # alternative spellings
        extract_alt_form_section(wxr, base_data, level_node)
    elif title_text in [
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
    ]:
        pass  # ignore
    else:
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="ko/page/63")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_section_categories(wxr, page_data, base_data, level_node)
def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process one level-2 (language) section of the page.

    Builds a fresh ``base_data`` for the language, walks all child sections,
    and falls back to treating the whole section as a single POS entry when
    no POS subsection created any entry.
    """
    pre_data_len = len(page_data)
    lang_name = clean_node(wxr, None, level2_node.largs) or "unknown"
    lang_code = name_to_code(lang_name, "ko") or "unknown"
    capture_codes = wxr.config.capture_language_codes
    if capture_codes is not None and lang_code not in capture_codes:
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    # Sound templates placed directly under the language heading apply to
    # every entry of this language.
    for t_node in level2_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in SOUND_TEMPLATES:
            extract_sound_template(wxr, base_data, t_node)

    for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    # no POS section
    if len(page_data) == pre_data_len:
        extract_pos_section(wxr, page_data, base_data, level2_node, "")
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Korean Wiktionary page into a list of word-entry dicts.

    # page layout
    # https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    # https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    """
    if page_title.startswith(("Appendix:", "T195546/NS111")):
        return []
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_section in root.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, lang_section)

    # Entries that ended up without any sense get an explicit marker tag.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_alt_form_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Extract alternative spellings from a "다른 표기"/"표기" section."""
    for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if template.template_name in ["alt", "alter"]:
            extract_alt_template(wxr, base_data, template)
def extract_alt_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse an expanded ``{{alt}}``/``{{alter}}`` template into forms.

    Spans whose ``lang`` matches the template's first argument become Form
    objects; a following ``*-Latn`` span supplies the romanization of the
    previous form, and a ``label-content`` span tags all collected forms.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    forms: list[Form] = []
    for span_tag in expanded.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_classes = span_tag.attrs.get("class", "").split()
        if span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                forms.append(Form(form=word))
        elif span_lang.endswith("-Latn") and len(forms) > 0:
            # Romanization span refers to the most recent form.
            forms[-1].roman = clean_node(wxr, None, span_tag)
        elif "label-content" in span_classes and len(forms) > 0:
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                # A label applies to every form collected so far.
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
    base_data.forms.extend(forms)