Coverage for src / wiktextract / extractor / ja / sound.py: 82%
239 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import itertools
2import re
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import capture_text_in_parentheses, set_sound_file_url_fields
15from .models import Hyphenation, Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract sounds from a pronunciation section and attach them to entries.

    Direct template children are handled by `process_sound_template`.  List
    children are walked item by item: for entries whose `lang_code` is "zh"
    each item goes to `extract_zh_sound_list_item`, otherwise every template
    directly inside the item goes to `process_sound_template`.

    The collected sounds and categories are then added to:
    - `base_data` and every `page_data` entry with the same `lang_code`, when
      `level_node.kind` is `NodeKind.LEVEL3`;
    - otherwise the last `page_data` entry, or `base_data` if `page_data` is
      empty.
    """
    collected = []
    cat_data = {}
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            process_sound_template(wxr, base_data, child, collected, cat_data)
            continue
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            if base_data.lang_code == "zh":
                extract_zh_sound_list_item(wxr, item, collected, [])
                continue
            for template in item.find_child(NodeKind.TEMPLATE):
                process_sound_template(
                    wxr, base_data, template, collected, cat_data
                )

    # Decide which entries receive the data, then update them uniformly.
    if level_node.kind == NodeKind.LEVEL3:
        targets = [base_data]
        targets.extend(
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        )
    elif len(page_data) > 0:
        targets = [page_data[-1]]
    else:
        targets = [base_data]

    categories = cat_data.get("categories", [])
    for entry in targets:
        entry.sounds.extend(collected)
        entry.categories.extend(categories)
def process_sound_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    sounds: list[Sound],
    cats: dict[str, list[str]],
) -> None:
    """Dispatch a pronunciation template to its extractor by template name.

    Extracted sounds are appended to `sounds`; hyphenation templates write to
    `base_data` instead.  Whatever the template is, it is finally passed
    through `clean_node` with `cats` as the category collector.
    """
    name = t_node.template_name

    # All of these extractors share the (wxr, t_node, sounds) signature.
    if name in ("音声", "audio"):
        handler = extract_audio_template
    elif name in ("IPA", "X-SAMPA") or name.endswith("-IPA"):
        handler = extract_ipa_template
    elif name == "homophones":
        handler = extract_homophones_template
    elif name == "ja-pron":
        handler = process_ja_pron_template
    elif name == "ja-accent-common":
        handler = process_ja_accent_common_template
    elif name in (
        "cmn-pron",
        "yue-pron",
        "nan-pron",
        "cdo-pron",
        "hak-pron",
        "wuu-pron",
    ):
        handler = extract_zh_sound_template
    elif name in ("rhymes", "rhyme"):
        handler = extract_rhymes_template
    else:
        handler = None

    if handler is not None:
        handler(wxr, t_node, sounds)
    elif name in ("hyphenation", "hyph"):
        extract_hyphenation_template(wxr, base_data, t_node)

    clean_node(wxr, cats, t_node)
def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append a `Sound` for the audio file named by an audio template.

    Positional parameter 2 is the file name; nothing is added when it is
    empty or "-".  Positional parameter 3, when non-empty, is stored as a
    raw tag on the sound.
    """
    file_name = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if file_name in ("", "-"):
        return
    entry = Sound()
    qualifier = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
    if qualifier:
        entry.raw_tags.append(qualifier)
    set_sound_file_url_fields(wxr, file_name, entry)
    sounds.append(entry)
def extract_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Expand an IPA-style template and collect the sounds it renders.

    List items in the expanded output are processed one by one.  All other
    top-level nodes are gathered under a temporary root node and processed
    together as if they were a single list item.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    loose_nodes = []
    for child in expanded.children:
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            loose_nodes.append(child)
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            sounds.extend(extract_ipa_list_item(wxr, item))
    if loose_nodes:
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = loose_nodes
        sounds.extend(extract_ipa_list_item(wxr, wrapper))
def extract_ipa_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Return the sounds found in the `<span>` tags below `list_item`.

    Spans with class "qualifier-content" or "ib-content" contribute
    comma-separated raw tags.  Spans with class "IPA" or "SAMPA" produce a
    `Sound` built with the raw tags gathered so far; SAMPA text is wrapped
    in slashes and tagged "X-SAMPA".  Spans with empty text are skipped.
    """
    qualifiers = []
    found = []
    for span in list_item.find_html_recursively("span"):
        classes = span.attrs.get("class", "").split()
        if "qualifier-content" in classes or "ib-content" in classes:
            pieces = (
                piece.strip()
                for piece in clean_node(wxr, None, span).split(",")
            )
            qualifiers.extend(piece for piece in pieces if piece != "")
            continue

        is_sampa = "SAMPA" in classes
        if not ("IPA" in classes or is_sampa):
            continue
        entry = Sound(ipa=clean_node(wxr, None, span), raw_tags=qualifiers)
        if entry.ipa == "":
            continue
        if is_sampa:
            entry.ipa = f"/{entry.ipa}/"
            entry.tags.append("X-SAMPA")
        translate_raw_tags(entry)
        found.append(entry)
    return found
def extract_homophones_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append one `Sound` listing the homophones given to the template.

    Positional parameters are read from 1 upwards until the first missing
    index; empty values are dropped.  Nothing is appended if no homophone
    remains.
    """
    params = t_node.template_parameters
    words = []
    position = 1
    while position in params:
        text = clean_node(wxr, None, params[position])
        if text:
            words.append(text)
        position += 1
    if words:
        sounds.append(Sound(homophones=words))
# Link text looked up by `process_ja_pron_template` in expanded "ja-pron"
# output, mapped to the tag added to the sound.
JA_PRON_ACCENTS = {
    "中高型": "Nakadaka",
    "平板型": "Heiban",
    "頭高型": "Atamadaka",
    "尾高型": "Odaka",
}
def process_ja_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract sounds from an expanded "ja-pron" template.

    Each list item of the expansion that does not contain a table yields at
    most one `Sound`: spans fill in raw tags ("qualifier-content"), `ipa`
    ("IPA"), `roman` ("Latn") and `other` ("Jpan"); direct link children
    whose text is a key of `JA_PRON_ACCENTS` add the mapped tag.  The sound
    is kept only if `ipa` or `other` is non-empty.

    Afterwards the "a" and "audio" template arguments, when non-empty, each
    add a separate audio `Sound`.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-pron
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        if item.contain_node(NodeKind.TABLE):
            continue
        entry = Sound()
        for span in item.find_html_recursively("span"):
            classes = span.attrs.get("class", "").split()
            if "qualifier-content" in classes:
                qualifier = clean_node(wxr, None, span)
                if qualifier:
                    entry.raw_tags.append(qualifier)
            elif "IPA" in classes:
                entry.ipa = clean_node(wxr, None, span)
            elif "Latn" in classes:
                entry.roman = clean_node(wxr, None, span)
            elif "Jpan" in classes:
                entry.other = clean_node(wxr, None, span)
        for link in item.find_child(NodeKind.LINK):
            accent_tag = JA_PRON_ACCENTS.get(clean_node(wxr, None, link))
            if accent_tag is not None:
                entry.tags.append(accent_tag)
        if entry.ipa != "" or entry.other != "":
            translate_raw_tags(entry)
            sounds.append(entry)

    for key in ("a", "audio"):
        file_name = clean_node(
            wxr, None, template_node.template_parameters.get(key, "")
        )
        if file_name:
            audio = Sound()
            set_sound_file_url_fields(wxr, file_name, audio)
            sounds.append(audio)
# First positional argument of the "ja-accent-common" template, mapped by
# `process_ja_accent_common_template` to the tag added to the sound.
JA_ACCENT_COMMON_TYPES = {
    "h": "Heiban",
    "a": "Atamadaka",
    "n": "Nakadaka",
    "o": "Odaka",
}
def process_ja_accent_common_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract one sound from an expanded "ja-accent-common" template.

    The first non-empty link text becomes a raw tag and the first non-empty
    `<span>` text is stored in `other`.  Positional parameter 1 is mapped
    through `JA_ACCENT_COMMON_TYPES` to a tag when it is a known key.  The
    sound is appended only if `other` ended up non-empty.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-accent-common
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    entry = Sound()

    # Generators keep the evaluation lazy: cleaning stops at the first
    # non-empty text, just like a loop with `break`.
    link_texts = (
        clean_node(wxr, None, link)
        for link in expanded.find_child_recursively(NodeKind.LINK)
    )
    first_link_text = next((text for text in link_texts if text != ""), "")
    if first_link_text != "":
        entry.raw_tags.append(first_link_text)

    span_texts = (
        clean_node(wxr, None, span)
        for span in expanded.find_html_recursively("span")
    )
    first_span_text = next((text for text in span_texts if text), "")
    if first_span_text:
        entry.other = first_span_text

    accent_key = clean_node(
        wxr, None, template_node.template_parameters.get(1, "")
    )
    accent_tag = JA_ACCENT_COMMON_TYPES.get(accent_key)
    if accent_tag is not None:
        entry.tags.append(accent_tag)

    if entry.other != "":
        translate_raw_tags(entry)
        sounds.append(entry)
def extract_homophone_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract homophones from the list items of a homophone section.

    Within each list item, a link node yields a homophone from its cleaned
    text, and an "l" template yields one from `extract_l_template` together
    with its sense, tags and raw tags.  Empty words are skipped.

    The resulting sounds are added to `base_data` and every `page_data`
    entry with the same `lang_code` when `level_node.kind` is
    `NodeKind.LEVEL3`; otherwise to the last `page_data` entry, or to
    `base_data` if `page_data` is empty.
    """
    collected = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            for child in item.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                    word = clean_node(wxr, None, child)
                    if word != "":
                        collected.append(Sound(homophones=[word]))
                    continue
                if not (
                    isinstance(child, TemplateNode)
                    and child.template_name == "l"
                ):
                    continue

                from .linkage import extract_l_template

                linked = extract_l_template(wxr, child)
                if linked.word == "":
                    continue
                collected.append(
                    Sound(
                        homophones=[linked.word],
                        sense=linked.sense,
                        tags=linked.tags,
                        raw_tags=linked.raw_tags,
                    )
                )

    if level_node.kind == NodeKind.LEVEL3:
        targets = [base_data]
        targets.extend(
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        )
    elif len(page_data) > 0:
        targets = [page_data[-1]]
    else:
        targets = [base_data]
    for entry in targets:
        entry.sounds.extend(collected)
def extract_zh_sound_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Extract sounds from an expanded Chinese pronunciation template.

    For each top-level list item, the nodes that precede a nested list form
    a label; the label text is split on ":" and "," into raw tags the first
    time a nested list is reached.  Every item of a nested list is then
    passed to `extract_zh_sound_list_item` with those raw tags.
    """
    # https://ja.wiktionary.org/wiki/カテゴリ:中国語_発音テンプレート
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for top_list in expanded.find_child(NodeKind.LIST):
        for top_item in top_list.find_child(NodeKind.LIST_ITEM):
            shared_tags = []
            label_nodes = []
            for child in top_item.children:
                if not (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.LIST
                ):
                    label_nodes.append(child)
                    continue
                if not shared_tags:
                    label_text = clean_node(wxr, None, label_nodes)
                    shared_tags.extend(
                        part
                        for part in map(str.strip, re.split(r":|,", label_text))
                        if part != ""
                    )
                for sub_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_zh_sound_list_item(
                        wxr, sub_item, sounds, shared_tags
                    )
def extract_zh_sound_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sounds: list[Sound],
    raw_tags: list[str],
) -> None:
    """Extract Chinese pronunciations from a "label: value, value" list item.

    The item's children are split at the first ":" found in a plain-string
    child: everything before it is the label, everything after it is the
    value.  Before the colon, audio templates ("音声"/"audio") are handed
    to `extract_audio_template`, `<small>` tags are dropped, and all other
    nodes join the label.

    Each non-empty comma-separated piece of the value becomes a `Sound`
    whose `zh_pron` is that piece.  Its raw tags are `raw_tags`, followed by
    the label text outside parentheses (if any) and the comma-separated
    pieces of each parenthesised part of the label.

    `raw_tags` itself is never modified.
    """
    after_colon = False
    tag_nodes = []
    value_nodes = []
    for node in list_item.children:
        if isinstance(node, str) and ":" in node and not after_colon:
            label_part, _, value_part = node.partition(":")
            tag_nodes.append(label_part)
            value_nodes.append(value_part)
            after_colon = True
        elif not after_colon:
            if isinstance(node, TemplateNode) and node.template_name in (
                "音声",
                "audio",
            ):
                extract_audio_template(wxr, node, sounds)
            elif not (isinstance(node, HTMLNode) and node.tag == "small"):
                tag_nodes.append(node)
        else:
            value_nodes.append(node)

    values = [
        value
        for value in map(str.strip, clean_node(wxr, None, value_nodes).split(","))
        if value != ""
    ]
    if not values:
        # No pronunciation: the label is never needed, so skip cleaning it.
        return

    # The label is the same for every value, so parse it once rather than
    # once per value.
    texts_in_p, text_out_p = capture_text_in_parentheses(
        clean_node(wxr, None, tag_nodes)
    )
    label_tags = []
    text_out_p = text_out_p.strip()
    if text_out_p != "":
        label_tags.append(text_out_p)
    for raw_tag_str in texts_in_p:
        label_tags.extend(
            raw_tag
            for raw_tag in map(str.strip, raw_tag_str.split(","))
            if raw_tag != ""
        )

    for value in values:
        # Copy the caller's list explicitly: it may be shared by sibling
        # list items, and each sound's tags are extended below.  This
        # avoids depending on whether `Sound` copies its input list.
        sound = Sound(zh_pron=value, raw_tags=list(raw_tags))
        sound.raw_tags.extend(label_tags)
        translate_raw_tags(sound)
        sounds.append(sound)
def extract_rhymes_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append a `Sound` with `rhymes` set for each rhyme the template renders.

    Rhymes are the non-empty texts of `<span class="IPA">` tags returned by
    `find_html` on the expanded template.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    ipa_spans = expanded.find_html(
        "span", attr_name="class", attr_value="IPA"
    )
    rhyme_texts = (clean_node(wxr, None, span) for span in ipa_spans)
    sounds.extend(Sound(rhymes=text) for text in rhyme_texts if text != "")
def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Add the hyphenations rendered by a hyphenation template to `base_data`.

    For "hyph", the text of every `<span>` whose `lang` attribute equals
    positional parameter 1 is one hyphenation string.  For any other
    template name, the cleaned template text with the "分綴:" prefix removed
    is the single hyphenation string.  Each string is split on "‧" into
    parts; empty parts are dropped and strings with no parts are ignored.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    if t_node.template_name == "hyph":
        lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
        texts = [
            clean_node(wxr, None, span)
            for span in expanded.find_html(
                "span", attr_name="lang", attr_value=lang_code
            )
        ]
    else:
        cleaned = clean_node(wxr, base_data, t_node)
        texts = [cleaned.removeprefix("分綴:").strip()]

    for text in texts:
        stripped_parts = (piece.strip() for piece in text.split("‧"))
        h_data = Hyphenation(parts=[part for part in stripped_parts if part])
        if h_data.parts:
            base_data.hyphenations.append(h_data)