Coverage for src/wiktextract/extractor/ja/page.py: 84% (90 statements)
« prev ^ index » next — coverage.py v7.13.5, created at 2026-04-17 07:22 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .conjugation import extract_conjugation_section
10from .etymology import extract_etymology_section
11from .kanji import extract_ja_kanji
12from .linkage import extract_alt_form_section, extract_linkage_section
13from .models import Sense, WordEntry
14from .pos import extract_note_section, parse_pos_section
15from .section_titles import LINKAGES, POS_DATA
16from .sound import extract_homophone_section, extract_sound_section
17from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch a section node to the extractor matching its heading.

    The heading text may bundle several titles separated by ":", ":" or
    "・"; the first recognized title wins.  Child level nodes are parsed
    recursively afterwards, and any "*-cat" templates directly under this
    section are cleaned for their categories.
    """
    # Drop trailing whitespace and digits (footnote-style markers) from
    # the heading before matching.
    heading = re.sub(
        r"[\s\d]+$", "", clean_node(wxr, None, level_node.largs)
    )
    for part in re.split(r":|:|・", heading):
        if part in POS_DATA:
            entries_before = len(page_data)
            parse_pos_section(wxr, page_data, base_data, level_node, part)
            # Some headings are ambiguous between POS and linkage; if no
            # POS entry was produced, fall back to linkage extraction on
            # the latest entry.
            if (
                len(page_data) == entries_before
                and part in LINKAGES
                and entries_before > 0
            ):
                extract_linkage_section(
                    wxr, page_data[-1], level_node, LINKAGES[part]
                )
            break
        elif (
            wxr.config.capture_etymologies
            and part in ["語源", "由来", "字源", "出典", "語誌"]
        ):
            extract_etymology_section(wxr, page_data, base_data, level_node)
            break
        elif (
            wxr.config.capture_pronunciation
            and part.startswith(("発音", "音価"))
        ):
            extract_sound_section(wxr, page_data, base_data, level_node)
            break
        elif part in ["翻訳", "訳語"] and wxr.config.capture_translations:
            target = page_data[-1] if page_data else base_data
            extract_translation_section(wxr, target, level_node)
            break
        elif part in LINKAGES and wxr.config.capture_linkages:
            # Attach to the last entry only when it belongs to the same
            # language as the current base data.
            target = base_data
            if page_data and page_data[-1].lang_code == base_data.lang_code:
                target = page_data[-1]
            extract_linkage_section(
                wxr, target, level_node, LINKAGES[part]
            )
            break
        elif (
            part in ["活用", "サ変動詞"]
            and wxr.config.capture_inflections
        ):
            target = page_data[-1] if page_data else base_data
            extract_conjugation_section(wxr, target, level_node)
            break
        elif part in [
            "異表記",
            "別表記",
            "代替表記",
            "異形",
            "表記揺れ",
        ]:  # "異表記・別形", Template:alter
            target = base_data
            if page_data and page_data[-1].lang_code == base_data.lang_code:
                target = page_data[-1]
            extract_alt_form_section(wxr, target, level_node)
            break
        elif part in [
            "用法",
            "注意点",
            "留意点",
            "注意",
            "備考",
            "表記",
            "補足",
            "語法",
        ]:
            target = page_data[-1] if page_data else base_data
            extract_note_section(wxr, target, level_node)
            break
        elif part == "同音異義語":
            extract_homophone_section(wxr, page_data, base_data, level_node)
            break
        else:
            # Known non-content sections are silently skipped; anything
            # else is logged so new section titles get noticed.
            if part not in [
                "脚注",
                "参照",
                "参考文献",
                "参考",
                "同音の漢字",
                "参考辞書",
                "外部リンク",
            ]:
                wxr.wtp.debug(
                    f"Unknown section: {part}",
                    sortid="extractor/ja/page/parse_section/93",
                )

    # Recurse into nested subsections.
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    # Category templates placed directly inside this section.
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.endswith("-cat"):
            target = page_data[-1] if page_data else base_data
            clean_node(wxr, target, t_node)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Japanese Wiktionary page into word-entry dicts.

    Page layout:
    https://ja.wiktionary.org/wiki/Wiktionary:スタイルマニュアル
    """
    # Appendix/thesaurus namespaces and conjugation subpages are skipped.
    if page_title.startswith(
        ("Appendix:", "シソーラス:")
    ) or page_title.endswith("(活用)"):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    # Each level-2 section corresponds to one language.
    for lang_level in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_level.largs)
        if lang_name == "":
            lang_name = "unknown"
            lang_code = "unknown"
        else:
            lang_code = name_to_code(lang_name, "ja")
            if lang_code == "":
                # The heading didn't resolve to a code; look for a
                # language template like {{L|xx}} or a bare "{{xx}}"
                # code template in the heading content.
                for template in lang_level.find_content(NodeKind.TEMPLATE):
                    if template.template_name == "L":
                        lang_code = template.template_parameters.get(1, "")
                    elif re.fullmatch(r"[a-z-]+", template.template_name):
                        lang_code = template.template_name
            if lang_code == "":
                lang_code = "unknown"

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        # Category links and templates placed directly under the
        # language heading.
        for link_node in lang_level.find_child(NodeKind.LINK):
            clean_node(wxr, base_data, link_node)
        for t_node in lang_level.find_child(NodeKind.TEMPLATE):
            if t_node.template_name.endswith("-cat"):
                clean_node(wxr, base_data, t_node)
            elif t_node.template_name == "ja-kanji":
                extract_ja_kanji(wxr, base_data, t_node)
        for pos_level in lang_level.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, pos_level)

    # Entries without any sense get an explicit "no-gloss" marker.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]