Coverage for src/wiktextract/extractor/ko/page.py: 62% (104 statements)
coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
import re
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    LevelNode,
    NodeKind,
    TemplateNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .linkage import extract_linkage_section
from .models import Form, Sense, WordEntry
from .pos import extract_grammar_note_section, extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import (
    SOUND_TEMPLATES,
    extract_sound_section,
    extract_sound_template,
)
from .tags import translate_raw_tags
from .translation import extract_translation_section

def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
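    """Collect categories from bare links and from "C"/"topics" templates
    placed directly under this section, attaching them to the newest word
    entry (or to base_data when no entry exists yet)."""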
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, link_node
        )
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ["C", "topics"]:  # coverage: never true
            clean_node(
                wxr, page_data[-1] if len(page_data) > 0 else base_data, t_node
            )

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
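    """Dispatch a section to the matching extractor based on its cleaned
    title: part of speech, linkage, translation ("번역"), pronunciation
    ("발음"), etymology ("어원"), grammar notes ("어법 주의 사항"), or
    alternative forms ("다른 표기"/"표기"). A fixed list of other headings
    is ignored, anything else is logged as unknown, and child sections are
    parsed recursively."""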
    title_text = clean_node(wxr, None, level_node.largs)
    title_text = re.sub(r"\s*\d+$", "", title_text).strip("() ")
    if "(" in title_text:  # coverage: never true
        title_text = title_text[: title_text.index("(")]
    if title_text.removeprefix("보조 ").strip() in POS_DATA:
        orig_page_data_len = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if (
            len(page_data) == orig_page_data_len
            and title_text in LINKAGE_SECTIONS
            and len(page_data) > 0
        ):  # try to extract it as a linkage section instead
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title_text]
            )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "번역":
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "발음":
        extract_sound_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "어원":  # coverage: always true
        extract_etymology_section(
            wxr,
            page_data[-1]
            if len(page_data) > 0 and len(page_data[-1].etymology_texts) == 0
            else base_data,
            level_node,
        )
    elif title_text == "어법 주의 사항":
        extract_grammar_note_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
        )
    elif title_text in ["다른 표기", "표기"]:
        extract_alt_form_section(wxr, base_data, level_node)
    elif title_text in [
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
        "외부 링크",
    ]:
        pass  # ignore
    else:
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="ko/page/63")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_section_categories(wxr, page_data, base_data, level_node)
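
# Illustrative only (not part of the module): how the title normalization at
# the top of parse_section behaves. The headings below are hypothetical
# examples of a numbered heading and an auxiliary-POS heading.
#
#     >>> re.sub(r"\s*\d+$", "", "명사 1").strip("() ")
#     '명사'
#     >>> "보조 동사".removeprefix("보조 ").strip()
#     '동사'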

def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
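    """Parse one level-2 language section: resolve the language name and
    code, build the shared base_data entry, read sound templates placed
    directly under the heading, then parse all child sections. If no child
    section produced an entry, the section itself is parsed as a POS
    section."""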
    pre_data_len = len(page_data)
    lang_name = clean_node(wxr, None, level2_node.largs)
    if lang_name == "":  # coverage: never true
        lang_name = "unknown"
    lang_code = name_to_code(lang_name, "ko")
    if lang_code == "":
        lang_code = "unknown"
    if (  # coverage: never true
        wxr.config.capture_language_codes is not None
        and lang_code not in wxr.config.capture_language_codes
    ):
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    for t_node in level2_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in SOUND_TEMPLATES:  # coverage: always true
            extract_sound_template(wxr, base_data, t_node)

    for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    # no POS section
    if len(page_data) == pre_data_len:
        extract_pos_section(wxr, page_data, base_data, level2_node, "")

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
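    """Entry point: parse one wiki page into a list of word-entry dicts,
    handling each level-2 language section in turn. Entries that end up
    with no sense get a "no-gloss" placeholder sense."""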
    # page layout
    # https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    # https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    if page_title.startswith(("Appendix:", "T195546/NS111")):  # coverage: never true
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, level2_node)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]

def extract_alt_form_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
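    """Extract alternative forms from "alt"/"alter" templates found
    anywhere in this section."""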
    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if t_node.template_name in ["alt", "alter"]:
            extract_alt_template(wxr, base_data, t_node)

def extract_alt_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
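    """Expand an "alt"/"alter" template and read the HTML spans it
    produces: spans whose lang matches the template's first argument become
    forms, "*-Latn" spans supply the romanization of the previous form, and
    "label-content" spans are applied to all collected forms as raw tags."""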
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    forms = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_class = span_tag.attrs.get("class", "").split()
        if span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                forms.append(Form(form=word))
        elif span_lang.endswith("-Latn") and len(forms) > 0:
            forms[-1].roman = clean_node(wxr, None, span_tag)
        elif "label-content" in span_class and len(forms) > 0:
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
    base_data.forms.extend(forms)
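
The following is an illustrative sketch only, not part of the module: one way to drive parse_page from a separate script. The Wtp and WiktionaryConfig keyword arguments mirror the project's test setup and are assumptions rather than a documented API; the page title and wikitext are hypothetical.

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.ko.page import parse_page
from wiktextract.wxr_context import WiktextractContext

# Assumed constructor keywords, copied from the style used in the test suite.
wxr = WiktextractContext(
    Wtp(lang_code="ko"),
    WiktionaryConfig(dump_file_lang_code="ko", capture_language_codes=None),
)
# Hypothetical page: a Korean level-2 section with a noun ("명사") heading.
print(parse_page(wxr, "사과", "== 한국어 ==\n=== 명사 ===\n* 과일의 하나."))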