Coverage for src/wiktextract/extractor/ja/sound.py: 90%
194 statements
« prev ^ index » next · coverage.py v7.11.0, created at 2025-10-24 07:36 +0000
1import itertools
2import re
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import capture_text_in_parentheses, set_sound_file_url_fields
15from .models import Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Gather sounds from a pronunciation section and attach them to entries.

    Direct template children of ``level_node`` go through
    ``process_sound_template``.  For list children, each list item is either
    handed to ``extract_zh_sound_list_item`` (when ``base_data.lang_code`` is
    ``"zh"``) or scanned for templates that are processed the same way as
    top-level ones.

    The collected sounds and categories are then added to:

    * ``base_data`` and every ``page_data`` entry sharing its ``lang_code``
      when the section is a level-3 heading;
    * otherwise the last entry of ``page_data`` if there is one;
    * otherwise ``base_data``.
    """
    collected = []
    cats = {}
    is_chinese = base_data.lang_code == "zh"

    for child in level_node.children:
        if isinstance(child, TemplateNode):
            process_sound_template(wxr, child, collected, cats)
            continue
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            if is_chinese:
                extract_zh_sound_list_item(wxr, item, collected, [])
                continue
            for template in item.find_child(NodeKind.TEMPLATE):
                process_sound_template(wxr, template, collected, cats)

    # Decide which entries receive the data, then apply it uniformly.
    if level_node.kind == NodeKind.LEVEL3:
        receivers = [base_data]
        receivers.extend(
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        )
    elif page_data:
        receivers = [page_data[-1]]
    else:
        receivers = [base_data]

    for entry in receivers:
        entry.sounds.extend(collected)
        entry.categories.extend(cats.get("categories", []))
def process_sound_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sounds: list[Sound],
    cats: dict[str, list[str]],
) -> None:
    """Dispatch one template to its extractor based on the template name.

    Templates with an unrecognised name produce no sound data.  In every
    case the template is finally passed to ``clean_node`` together with
    ``cats`` so that call can record into ``cats``.
    """
    handlers = {
        "音声": extract_audio_template,
        "audio": extract_audio_template,
        "IPA": extract_ipa_template,
        "X-SAMPA": extract_ipa_template,
        "homophones": extract_homophones_template,
        "ja-pron": process_ja_pron_template,
        "ja-accent-common": process_ja_accent_common_template,
        "cmn-pron": extract_zh_sound_template,
        "yue-pron": extract_zh_sound_template,
        "nan-pron": extract_zh_sound_template,
        "cdo-pron": extract_zh_sound_template,
        "hak-pron": extract_zh_sound_template,
        "wuu-pron": extract_zh_sound_template,
    }
    handler = handlers.get(t_node.template_name)
    if handler is not None:
        handler(wxr, t_node, sounds)

    # Runs for every template, recognised or not.
    clean_node(wxr, cats, t_node)
def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append a ``Sound`` for the audio file named in template argument 2.

    Nothing is appended when argument 2 cleans to ``""`` or ``"-"``.
    Argument 3, when non-empty after cleaning, is stored as a raw tag.
    """
    params = t_node.template_parameters
    file_name = clean_node(wxr, None, params.get(2, ""))
    if file_name in ("", "-"):
        return

    entry = Sound()
    label = clean_node(wxr, None, params.get(3, ""))
    if label:
        entry.raw_tags.append(label)
    set_sound_file_url_fields(wxr, file_name, entry)
    sounds.append(entry)
def extract_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append one ``Sound`` per non-empty positional argument of the template.

    Arguments are read as 1, 2, 3, ... until the first missing index.  Each
    cleaned value is wrapped in slashes and stored in ``ipa``; for the
    ``X-SAMPA`` template the sound also gets the ``"X-SAMPA"`` tag.
    """
    params = t_node.template_parameters
    is_x_sampa = t_node.template_name == "X-SAMPA"

    position = 1
    while position in params:
        text = clean_node(wxr, None, params[position])
        position += 1
        if not text:
            continue
        entry = Sound(ipa=f"/{text}/")
        if is_x_sampa:
            entry.tags.append("X-SAMPA")
        sounds.append(entry)
def extract_homophones_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append a single ``Sound`` listing the template's homophones.

    Positional arguments 1, 2, 3, ... are read until the first missing
    index; values that clean to an empty string are dropped.  Nothing is
    appended when no homophone remains.
    """
    params = t_node.template_parameters
    present_indexes = itertools.takewhile(
        lambda position: position in params, itertools.count(1)
    )
    cleaned = (
        clean_node(wxr, None, params[position]) for position in present_indexes
    )
    words = [word for word in cleaned if word]
    if words:
        sounds.append(Sound(homophones=words))
# Link texts looked up by `process_ja_pron_template`: when the cleaned text
# of a link in the expanded `ja-pron` output equals a key, the mapped value
# is appended to the sound's `tags`.
# NOTE(review): the keys look like Japanese pitch-accent pattern names -
# confirm against the template documentation.
JA_PRON_ACCENTS: dict[str, str] = {
    "中高型": "Nakadaka",
    "平板型": "Heiban",
    "頭高型": "Atamadaka",
    "尾高型": "Odaka",
}
def process_ja_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract sounds from an expanded ``ja-pron`` template.

    The template is expanded and re-parsed.  Every list item that does not
    contain a table yields at most one ``Sound``, filled from ``<span>``
    elements according to their ``class`` attribute and from links whose
    text is a key of ``JA_PRON_ACCENTS``.  A sound is kept only if at least
    one field differs from its default.

    Audio files named in the ``a`` and ``audio`` template arguments each
    add a further ``Sound``.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-pron
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    # (substring of the span's class attribute, Sound field to assign);
    # checked in this order, first match wins.
    span_fields = (("IPA", "ipa"), ("Latn", "roman"), ("Jpan", "form"))

    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        if item.contain_node(NodeKind.TABLE):
            continue

        entry = Sound()
        for span in item.find_html_recursively("span"):
            classes = span.attrs.get("class", "")
            if "qualifier-content" in classes:
                qualifier = clean_node(wxr, None, span)
                if qualifier:
                    entry.raw_tags.append(qualifier)
                continue
            for marker, field_name in span_fields:
                if marker in classes:
                    setattr(entry, field_name, clean_node(wxr, None, span))
                    break

        for link in item.find_child(NodeKind.LINK):
            accent = clean_node(wxr, None, link)
            if accent in JA_PRON_ACCENTS:
                entry.tags.append(JA_PRON_ACCENTS[accent])

        if entry.model_dump(exclude_defaults=True):
            sounds.append(entry)

    # NOTE(review): the flattened source does not show nesting; this loop is
    # placed at function level (once per template) - confirm.
    params = template_node.template_parameters
    for key in ("a", "audio"):
        file_name = clean_node(wxr, None, params.get(key, ""))
        if not file_name:
            continue
        audio = Sound()
        set_sound_file_url_fields(wxr, file_name, audio)
        sounds.append(audio)
# Values of the first argument of the `ja-accent-common` template, as looked
# up by `process_ja_accent_common_template`: when the cleaned argument equals
# a key, the mapped value is appended to the sound's `tags`.
JA_ACCENT_COMMON_TYPES: dict[str, str] = {
    "h": "Heiban",
    "a": "Atamadaka",
    "n": "Nakadaka",
    "o": "Odaka",
}
def process_ja_accent_common_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract one sound from an expanded ``ja-accent-common`` template.

    From the expanded template, the first link with non-empty text becomes
    a raw tag and the first ``<span>`` with non-empty text becomes the
    form.  Template argument 1 is mapped through ``JA_ACCENT_COMMON_TYPES``
    to a tag.  The sound is appended only if its form is not empty.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-accent-common
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    entry = Sound()

    # Generators keep the cleaning lazy: nodes after the first non-empty
    # match are never cleaned.
    link_texts = (
        clean_node(wxr, None, link)
        for link in expanded.find_child_recursively(NodeKind.LINK)
    )
    first_link_text = next((text for text in link_texts if text != ""), None)
    if first_link_text is not None:
        entry.raw_tags.append(first_link_text)

    span_texts = (
        clean_node(wxr, None, span)
        for span in expanded.find_html_recursively("span")
    )
    first_span_text = next((text for text in span_texts if text), None)
    if first_span_text is not None:
        entry.form = first_span_text

    accent_key = clean_node(
        wxr, None, template_node.template_parameters.get(1, "")
    )
    accent_tag = JA_ACCENT_COMMON_TYPES.get(accent_key)
    if accent_tag is not None:
        entry.tags.append(accent_tag)

    if entry.form != "":
        sounds.append(entry)
def extract_homophone_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect homophones listed in a section and attach them as sounds.

    Each link directly inside a list item, and each ``l`` template there,
    yields one ``Sound`` with a single homophone (empty words are skipped).
    For ``l`` templates the sense, tags and raw tags returned by
    ``extract_l_template`` are copied onto the sound.

    The sounds are added to ``base_data`` and every ``page_data`` entry of
    the same language for a level-3 section; otherwise to the last
    ``page_data`` entry if any, else to ``base_data``.
    """
    found = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            for child in item.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                    text = clean_node(wxr, None, child)
                    if text != "":
                        found.append(Sound(homophones=[text]))
                    continue
                if (
                    isinstance(child, TemplateNode)
                    and child.template_name == "l"
                ):
                    # Imported lazily, exactly where it is first needed.
                    from .linkage import extract_l_template

                    linked = extract_l_template(wxr, child)
                    if linked.word == "":
                        continue
                    homophone_sound = Sound(
                        homophones=[linked.word],
                        sense=linked.sense,
                        tags=linked.tags,
                        raw_tags=linked.raw_tags,
                    )
                    found.append(homophone_sound)

    if level_node.kind == NodeKind.LEVEL3:
        receivers = [base_data]
        receivers.extend(
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        )
    elif page_data:
        receivers = [page_data[-1]]
    else:
        receivers = [base_data]

    for entry in receivers:
        entry.sounds.extend(found)
def extract_zh_sound_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Extract sounds from an expanded Chinese pronunciation template.

    The template is expanded and re-parsed.  For each top-level list item,
    the nodes that precede its first nested list are cleaned, split on
    ``:`` and ``,`` and used as raw tags for every item of the nested
    list(s), which are handled by ``extract_zh_sound_list_item``.
    """
    # https://ja.wiktionary.org/wiki/カテゴリ:中国語_発音テンプレート
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    for list_node in expanded.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            labels = []
            label_nodes = []
            for child in item.children:
                is_nested_list = (
                    isinstance(child, WikiNode) and child.kind == NodeKind.LIST
                )
                if not is_nested_list:
                    label_nodes.append(child)
                    continue

                # Labels are computed the first time a nested list is met
                # (and again only if that attempt produced none).
                if not labels:
                    label_text = clean_node(wxr, None, label_nodes)
                    pieces = (
                        piece.strip()
                        for piece in re.split(r":|,", label_text)
                    )
                    labels.extend(piece for piece in pieces if piece != "")

                for nested_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_zh_sound_list_item(
                        wxr, nested_item, sounds, labels
                    )
def extract_zh_sound_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sounds: list[Sound],
    raw_tags: list[str],
):
    """Extract Chinese pronunciations from one ``label: value, value`` item.

    The item's children are split at the first ``:`` found in a plain-text
    child: what precedes it is the label, what follows is the value list.
    Before the colon, ``音声``/``audio`` templates are extracted as audio
    sounds and ``<small>`` elements are ignored.

    Each non-empty comma-separated value becomes a ``Sound`` whose
    ``zh_pron`` is the value and whose raw tags are ``raw_tags`` followed by
    the label text outside parentheses and then each comma-separated piece
    of the parenthesised text.  ``translate_raw_tags`` is applied to every
    sound before it is appended to ``sounds``.

    Args:
        wxr: Extraction context passed through to the cleaning helpers.
        list_item: The list item node to read.
        sounds: Output list; new sounds are appended in place.
        raw_tags: Tags inherited from the caller; not modified here.
    """
    after_colon = False
    tag_nodes = []
    value_nodes = []
    for node in list_item.children:
        if after_colon:
            value_nodes.append(node)
        elif isinstance(node, str) and ":" in node:
            # Only the first colon separates label from values.
            label_part, _, value_part = node.partition(":")
            tag_nodes.append(label_part)
            value_nodes.append(value_part)
            after_colon = True
        elif isinstance(node, TemplateNode) and node.template_name in (
            "音声",
            "audio",
        ):
            extract_audio_template(wxr, node, sounds)
        elif not (isinstance(node, HTMLNode) and node.tag == "small"):
            tag_nodes.append(node)

    values = [
        piece.strip()
        for piece in clean_node(wxr, None, value_nodes).split(",")
    ]
    values = [value for value in values if value != ""]
    if not values:
        # The label is only cleaned when at least one sound is produced.
        return

    # The label is the same for every value, so parse it once instead of
    # once per value.
    texts_in_p, text_out_p = capture_text_in_parentheses(
        clean_node(wxr, None, tag_nodes)
    )
    label_tags = []
    text_out_p = text_out_p.strip()
    if text_out_p != "":
        label_tags.append(text_out_p)
    for raw_tag_str in texts_in_p:
        for raw_tag in raw_tag_str.split(","):
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                label_tags.append(raw_tag)

    for value in values:
        # Pass a copy so appending below can never grow the caller's list.
        # NOTE(review): the model may already copy list fields - confirm.
        sound = Sound(zh_pron=value, raw_tags=list(raw_tags))
        sound.raw_tags.extend(label_tags)
        translate_raw_tags(sound)
        sounds.append(sound)
321 sounds.append(sound)