Coverage for src/wiktextract/extractor/ko/page.py: 85%
72 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .etymology import extract_etymology_section
10from .linkage import extract_linkage_section
11from .models import Sense, WordEntry
12from .pos import extract_grammar_note_section, extract_pos_section
13from .section_titles import LINKAGE_SECTIONS, POS_DATA
14from .sound import (
15 SOUND_TEMPLATES,
16 extract_sound_section,
17 extract_sound_template,
18)
19from .translation import extract_translation_section
def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Run ``clean_node`` on every link child of ``level_node``.

    Each link returned by ``level_node.find_child(NodeKind.LINK)`` is cleaned
    against a single word entry (presumably so that category links get
    recorded on that entry -- ``clean_node`` is defined elsewhere, confirm).

    The entry is the newest item of ``page_data`` only when that item has the
    same ``lang_code`` as ``base_data``; otherwise ``base_data`` is used.
    ``page_data`` is shared by all language sections of a page, so without
    the language check the links found at the top of a later language
    section would be attached to the last entry of the previous language.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        page_data: Entries collected so far for the whole page.
        base_data: Template entry of the language section being parsed.
        level_node: Section node whose link children are processed.
    """
    target = base_data
    # NOTE(review): assumes entries appended for a language keep the
    # ``lang_code`` of that language's ``base_data`` -- confirm against
    # ``extract_pos_section``.
    if page_data and page_data[-1].lang_code == base_data.lang_code:
        target = page_data[-1]

    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, target, link_node)
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section to the matching extractor, then recurse.

    The heading text is normalised (trailing number removed, surrounding
    parentheses/spaces stripped, anything from the first "(" onwards dropped)
    and compared against the known section titles. After the section itself
    has been handled, every nested level node is parsed recursively and
    finally the links of this section are passed to
    ``extract_section_categories``.

    Non-POS sections write to the newest entry of ``page_data`` only when
    that entry has the same ``lang_code`` as ``base_data``; otherwise they
    write to ``base_data``. ``page_data`` is shared between the language
    sections of a page, so its last item may belong to a previous language.

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far for the whole page; POS sections
            may append to it.
        base_data: Template entry of the language section being parsed.
        level_node: The section node to process.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    # Drop a trailing counter such as " 2" and surrounding parentheses.
    title_text = re.sub(r"\s*\d+$", "", title_text).strip("() ")
    if "(" in title_text:
        # Keep only the text before a parenthesised qualifier. Strip again:
        # "번역 (x" must become "번역", not "번역 ", or none of the exact title
        # comparisons and LINKAGE_SECTIONS lookups below can match.
        title_text = title_text[: title_text.index("(")].strip()

    # NOTE(review): assumes entries appended for a language keep the
    # ``lang_code`` of that language's ``base_data`` -- confirm against
    # ``extract_pos_section``.
    has_current_entry = (
        len(page_data) > 0 and page_data[-1].lang_code == base_data.lang_code
    )
    target = page_data[-1] if has_current_entry else base_data

    if title_text.removeprefix("보조 ").strip() in POS_DATA:
        orig_page_data_len = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if (
            len(page_data) == orig_page_data_len
            and title_text in LINKAGE_SECTIONS
            and len(page_data) > 0
            and page_data[-1].lang_code == base_data.lang_code
        ):  # no entry was added: try to extract as linkage section
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title_text]
            )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, target, level_node, LINKAGE_SECTIONS[title_text]
        )
    elif title_text == "번역":
        extract_translation_section(wxr, target, level_node)
    elif title_text == "발음":
        extract_sound_section(wxr, target, level_node)
    elif title_text == "어원":
        # Only reuse the newest entry while it has no etymology text yet;
        # otherwise the etymology goes to the language template entry.
        if has_current_entry and len(page_data[-1].etymology_texts) == 0:
            etymology_target = page_data[-1]
        else:
            etymology_target = base_data
        extract_etymology_section(wxr, etymology_target, level_node)
    elif title_text == "어법 주의 사항":
        extract_grammar_note_section(wxr, target, level_node)
    elif title_text in [
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
    ]:
        pass  # ignore
    else:
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="ko/page/63")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_section_categories(wxr, page_data, base_data, level_node)
def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Parse one level-2 section and add its entries to ``page_data``.

    The heading text is used as the language name and mapped to a code with
    ``name_to_code(..., "ko")``; an empty name or code becomes "unknown".
    Nothing is extracted when ``wxr.config.capture_language_codes`` is set
    and does not contain the code.

    A template ``WordEntry`` (``pos="unknown"``) is built for the language,
    the section's links and sound templates are processed, and each nested
    level node is handed to ``parse_section``. If that added no entry to
    ``page_data``, the level-2 node itself is passed to
    ``extract_pos_section`` with an empty title.

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far for the whole page.
        level2_node: The level-2 section node to parse.
    """
    entries_before = len(page_data)

    heading = clean_node(wxr, None, level2_node.largs)
    lang_name = "unknown" if heading == "" else heading
    code = name_to_code(lang_name, "ko")
    lang_code = "unknown" if code == "" else code

    wanted_codes = wxr.config.capture_language_codes
    if wanted_codes is not None and lang_code not in wanted_codes:
        return

    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title, lang_code=lang_code, lang=lang_name, pos="unknown"
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)

    for template in level2_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in SOUND_TEMPLATES:
            continue
        extract_sound_template(wxr, base_data, template)

    for child_section in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)

    if len(page_data) != entries_before:
        return
    # No POS section added an entry: treat the language section itself as one.
    extract_pos_section(wxr, page_data, base_data, level2_node, "")
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Every level-2 node of the parsed page is handed to
    ``parse_language_section``. An entry that ends up without senses gets a
    single ``Sense`` tagged "no-gloss". Entries are serialised with
    ``model_dump(exclude_defaults=True)``.

    Page layout references:
    https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부

    Args:
        wxr: Extraction context.
        page_title: Title of the page being parsed.
        page_text: Wikitext of the page.

    Returns:
        One dict per extracted word entry, in extraction order.
    """
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)

    entries: list[WordEntry] = []
    for language_node in root.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, entries, language_node)

    dumped: list[dict[str, Any]] = []
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
        dumped.append(entry.model_dump(exclude_defaults=True))
    return dumped