# Coverage for src/wiktextract/extractor/ja/page.py: 82%
# 84 statements
# coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
import re
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .conjugation import extract_conjugation_section
from .etymology import extract_etymology_section
from .linkage import extract_alt_form_section, extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_note_section, parse_pos_section
from .section_titles import LINKAGES, POS_DATA
from .sound import extract_homophone_section, extract_sound_section
from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch a ja.wiktionary section node to the matching extractor.

    The section heading text is cleaned, stripped of trailing digits and
    whitespace, then split on ":", ":" and "・" so that combined headings
    (e.g. "A・B") are tried piece by piece; the first piece that matches a
    known category is handled and the loop stops.

    Args:
        wxr: extraction context.
        page_data: accumulated word entries for the page; extractors may
            append to it or modify the last entry.
        base_data: language-level defaults used when no entry exists yet.
        level_node: the heading node whose subtree is processed.
    """
    # Trailing digits are dropped so numbered headings ("名詞 1") match
    # the unnumbered keys in POS_DATA / LINKAGES.
    title_texts = re.sub(
        r"[\s\d]+$", "", clean_node(wxr, None, level_node.largs)
    )
    for title_text in re.split(r":|:|・", title_texts):
        if title_text in POS_DATA:
            pre_len = len(page_data)
            parse_pos_section(wxr, page_data, base_data, level_node, title_text)
            # If the POS parser produced no new entry and the same title
            # is also a linkage heading, fall back to linkage extraction
            # on the previous entry.
            if (
                len(page_data) == pre_len
                and title_text in LINKAGES
                and pre_len > 0
            ):
                extract_linkage_section(
                    wxr, page_data[-1], level_node, LINKAGES[title_text]
                )
            break
        elif (
            # Etymology-related headings (語源/由来/字源/出典/語誌).
            title_text in ["語源", "由来", "字源", "出典", "語誌"]
            and wxr.config.capture_etymologies
        ):
            extract_etymology_section(wxr, page_data, base_data, level_node)
            break
        elif (
            # Pronunciation headings start with 発音 or 音価.
            title_text.startswith(("発音", "音価"))
            and wxr.config.capture_pronunciation
        ):
            extract_sound_section(wxr, page_data, base_data, level_node)
            break
        elif title_text in ["翻訳", "訳語"] and wxr.config.capture_translations:
            extract_translation_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
            break
        elif title_text in LINKAGES and wxr.config.capture_linkages:
            # Attach linkages to the last entry only if it belongs to the
            # current language section; otherwise fall back to base_data.
            extract_linkage_section(
                wxr,
                page_data[-1]
                if len(page_data) > 0
                and page_data[-1].lang_code == base_data.lang_code
                else base_data,
                level_node,
                LINKAGES[title_text],
            )
            break
        elif (
            # Conjugation headings (活用 / サ変動詞).
            title_text in ["活用", "サ変動詞"]
            and wxr.config.capture_inflections
        ):
            extract_conjugation_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
            break
        elif title_text in [
            "異表記",
            "別表記",
            "代替表記",
            "異形",
            "表記揺れ",
        ]:  # "異表記・別形", Template:alter
            # Alternative-form headings; same language-matching fallback
            # as the linkage branch above.
            extract_alt_form_section(
                wxr,
                page_data[-1]
                if len(page_data) > 0
                and page_data[-1].lang_code == base_data.lang_code
                else base_data,
                level_node,
            )
            break
        elif title_text in [
            "用法",
            "注意点",
            "留意点",
            "注意",
            "備考",
            "表記",
            "補足",
            "語法",
        ]:
            # Usage-note style headings.
            extract_note_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
            break
        elif title_text == "同音異義語":
            # Homophone section.
            extract_homophone_section(wxr, page_data, base_data, level_node)
            break
        else:
            # Known non-content headings (references, external links, …)
            # are silently skipped; anything else is logged for triage.
            if title_text not in [
                "脚注",
                "参照",
                "参考文献",
                "参考",
                "同音の漢字",
                "参考辞書",
                "外部リンク",
            ]:
                wxr.wtp.debug(
                    f"Unknown section: {title_text}",
                    sortid="extractor/ja/page/parse_section/93",
                )

    # Recurse into nested sub-sections regardless of which branch matched.
    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)
    # Category templates ("*-cat") at this level contribute categories to
    # the last entry (or base_data when no entry exists yet).
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.endswith("-cat"):
            clean_node(
                wxr, page_data[-1] if len(page_data) > 0 else base_data, t_node
            )
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one ja.wiktionary page into a list of word-entry dicts.

    Page layout:
    https://ja.wiktionary.org/wiki/Wiktionary:スタイルマニュアル

    Args:
        wxr: extraction context.
        page_title: wiki page title.
        page_text: raw wikitext of the page.

    Returns:
        One dict per extracted entry (``WordEntry.model_dump`` with
        defaults excluded); empty for appendix/thesaurus/conjugation pages.
    """
    # Skip non-mainspace and conjugation-table pages.
    if page_title.startswith(
        ("Appendix:", "シソーラス:")
    ) or page_title.endswith("(活用)"):
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    # Each level-2 section is one language.
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        if lang_name == "":
            lang_name = "unknown"
            lang_code = "unknown"
        else:
            lang_code = name_to_code(lang_name, "ja")
            if lang_code == "":
                # Heading didn't resolve to a language name; try to pull
                # a code out of templates in the heading itself: either
                # {{L|code}} or a bare language-code template like {{en}}.
                for template in level2_node.find_content(NodeKind.TEMPLATE):
                    if template.template_name == "L":
                        lang_code = template.template_parameters.get(1, "")
                    elif re.fullmatch(r"[a-z-]+", template.template_name):
                        lang_code = template.template_name
            if lang_code == "":
                lang_code = "unknown"
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        # Bare links directly under the language heading contribute
        # categories to the language-level base data.
        for link_node in level2_node.find_child(NodeKind.LINK):
            clean_node(wxr, base_data, link_node)
        for level3_node in level2_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, level3_node)

    # Guarantee every entry has at least one sense.
    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]