Coverage for src/wiktextract/extractor/ko/page.py: 88%
68 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .etymology import extract_etymology_section
10from .linkage import extract_linkage_section
11from .models import Sense, WordEntry
12from .pos import extract_pos_section
13from .section_titles import LINKAGE_SECTIONS, POS_DATA
14from .sound import (
15 SOUND_TEMPLATES,
16 extract_sound_section,
17 extract_sound_template,
18)
19from .translation import extract_translation_section
def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect category link nodes directly under *level_node*.

    Each link is passed through ``clean_node`` so its category data is
    attached to the newest word entry, or to ``base_data`` when no entry
    has been created yet.
    """
    target = page_data[-1] if len(page_data) > 0 else base_data
    for category_link in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, target, category_link)
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one wikitext section to the matching extractor.

    The section title (cleaned, with any trailing number such as "명사 1"
    stripped) selects the handler: part-of-speech, linkage, translation
    ("번역"), sound ("발음"), or etymology ("어원"). A few known
    non-lexical sections are deliberately ignored; anything else is
    reported via ``wxr.wtp.debug``. Child sections are processed
    recursively, then section-level category links are collected.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    # Titles may carry a numeric suffix used to separate homographs; drop it.
    title_text = re.sub(r"\s*\d+$", "", title_text)
    if title_text.removeprefix("보조 ").strip() in POS_DATA:
        orig_page_data_len = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        # Some titles are ambiguous between POS and linkage; if the POS
        # extractor produced no new entry, retry as a linkage section.
        if (
            len(page_data) == orig_page_data_len
            and title_text in LINKAGE_SECTIONS
            and len(page_data) > 0
        ):  # try extract as linkage section
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title_text]
            )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "번역":
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "발음":
        extract_sound_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "어원":
        extract_etymology_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text in ["참고 문헌", "독음", "자원"]:
        pass  # ignore
    else:
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="ko/page/63")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_section_categories(wxr, page_data, base_data, level_node)
def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process one level-2 (language) section of a Korean Wiktionary page.

    Resolves the language name/code (falling back to "unknown"), skips the
    section when the configured capture list excludes the code, builds a
    ``base_data`` template entry, extracts page-level sound templates and
    categories, and recurses into the child sections. If no POS section
    produced an entry, the whole language section is retried as a POS
    section with an empty title.
    """
    pre_data_len = len(page_data)
    lang_name = clean_node(wxr, None, level2_node.largs)
    if lang_name == "":
        lang_name = "unknown"
    lang_code = name_to_code(lang_name, "ko")
    if lang_code == "":
        lang_code = "unknown"
    if (
        wxr.config.capture_language_codes is not None
        and lang_code not in wxr.config.capture_language_codes
    ):
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    # Sound templates can appear directly under the language heading,
    # before any POS section; attach them to the shared base data.
    for t_node in level2_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in SOUND_TEMPLATES:
            extract_sound_template(wxr, base_data, t_node)

    for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    # no POS section
    if len(page_data) == pre_data_len:
        extract_pos_section(wxr, page_data, base_data, level2_node, "")
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a Korean Wiktionary page into word-entry dictionaries.

    Parses the wikitext, processes every level-2 (language) section, and
    tags entries that ended up without glosses as "no-gloss". Returns the
    entries serialized with default values omitted.
    """
    # page layout
    # https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    # https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, level2_node)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]