Coverage for src/wiktextract/extractor/ja/sound.py: 86%
149 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import itertools
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import set_sound_file_url_fields
8from .models import Sound, WordEntry
9from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect sounds from a pronunciation section and attach them to entries.

    When ``base_data.lang_code`` is ``"zh"`` the section is handed to
    ``extract_zh_sounds``; otherwise every template found anywhere under
    ``level_node`` is passed to ``process_sound_template``, which also fills
    a category dictionary.

    The collected sounds and categories are then appended to:

    * ``base_data`` and every ``page_data`` entry with the same ``lang_code``
      when ``level_node.kind`` is ``NodeKind.LEVEL3``;
    * otherwise the last entry of ``page_data``, or ``base_data`` when
      ``page_data`` is empty.
    """
    found_sounds = []
    found_cats = {}
    if base_data.lang_code != "zh":
        for tpl in level_node.find_child_recursively(NodeKind.TEMPLATE):
            process_sound_template(wxr, tpl, found_sounds, found_cats)
    else:
        extract_zh_sounds(wxr, level_node, found_sounds)

    category_names = found_cats.get("categories", [])

    # Work out which entries receive the data, then update them in one place.
    if level_node.kind == NodeKind.LEVEL3:
        receivers = [base_data]
        receivers += [
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        ]
    elif page_data:
        receivers = [page_data[-1]]
    else:
        receivers = [base_data]

    for entry in receivers:
        entry.sounds.extend(found_sounds)
        entry.categories.extend(category_names)
def _positional_texts(wxr: WiktextractContext, template_node: TemplateNode):
    """Yield the cleaned text of arguments 1, 2, ... until one is absent."""
    params = template_node.template_parameters
    for position in itertools.takewhile(
        lambda number: number in params, itertools.count(1)
    ):
        yield clean_node(wxr, None, params[position])


def process_sound_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
    cats: dict[str, list[str]],
) -> None:
    """Turn one pronunciation-related template into ``Sound`` objects.

    Handled template names:

    * ``音声`` - argument 2 is an audio file name (ignored when empty or
      ``"-"``), argument 3 an optional raw tag.
    * ``IPA`` / ``X-SAMPA`` - every non-empty positional argument becomes a
      ``Sound`` with ``ipa`` set; ``X-SAMPA`` ones get an ``"X-SAMPA"`` tag.
    * ``homophones`` - the non-empty positional arguments are gathered into
      a single ``Sound``.
    * ``ja-pron`` / ``ja-accent-common`` - delegated to their own functions.

    New sounds are appended to ``sounds``. Whatever the template name, the
    node is finally passed to ``clean_node`` together with ``cats``.
    """
    name = template_node.template_name
    if name == "音声":
        file_name = clean_node(
            wxr, None, template_node.template_parameters.get(2, "")
        )
        if file_name not in ("", "-"):
            audio = Sound()
            label = clean_node(
                wxr, None, template_node.template_parameters.get(3, "")
            )
            if label:
                audio.raw_tags.append(label)
            set_sound_file_url_fields(wxr, file_name, audio)
            sounds.append(audio)
    elif name in ("IPA", "X-SAMPA"):
        for text in _positional_texts(wxr, template_node):
            if not text:
                continue
            transcription = Sound(ipa=text)
            if name == "X-SAMPA":
                transcription.tags.append("X-SAMPA")
            sounds.append(transcription)
    elif name == "homophones":
        words = [
            text for text in _positional_texts(wxr, template_node) if text
        ]
        if words:
            sounds.append(Sound(homophones=words))
    elif name == "ja-pron":
        process_ja_pron_template(wxr, template_node, sounds)
    elif name == "ja-accent-common":
        process_ja_accent_common_template(wxr, template_node, sounds)

    # Result is discarded; the call is made for its effect on ``cats``
    # (presumably category collection - the caller reads
    # ``cats["categories"]`` afterwards).
    clean_node(wxr, cats, template_node)
# Link text looked up by process_ja_pron_template in the expanded ja-pron
# output -> tag appended to the resulting Sound.
JA_PRON_ACCENTS: dict[str, str] = {
    "中高型": "Nakadaka",
    "平板型": "Heiban",
    "頭高型": "Atamadaka",
    "尾高型": "Odaka",
}
def process_ja_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract pronunciations from a ``ja-pron`` template.

    The template is expanded and re-parsed. Every list item of the result
    that does not contain a table yields one ``Sound`` built from its
    ``<span>`` elements, selected by substring match on the ``class``
    attribute:

    * ``qualifier-content`` -> appended to ``raw_tags`` (when non-empty)
    * ``IPA`` -> ``ipa``
    * ``Latn`` -> ``roman``
    * ``Jpan`` -> ``form``

    Direct child links whose text is a key of ``JA_PRON_ACCENTS`` add the
    mapped tag. A ``Sound`` is kept only if at least one field differs from
    its default. Afterwards the ``a`` and ``audio`` template arguments, when
    non-empty, each produce an audio ``Sound``.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-pron
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        if item.contain_node(NodeKind.TABLE):
            continue

        pron = Sound()
        for span in item.find_html_recursively("span"):
            classes = span.attrs.get("class", "")
            if "qualifier-content" in classes:
                qualifier = clean_node(wxr, None, span)
                if qualifier:
                    pron.raw_tags.append(qualifier)
            elif "IPA" in classes:
                pron.ipa = clean_node(wxr, None, span)
            elif "Latn" in classes:
                pron.roman = clean_node(wxr, None, span)
            elif "Jpan" in classes:
                pron.form = clean_node(wxr, None, span)

        for link in item.find_child(NodeKind.LINK):
            accent_tag = JA_PRON_ACCENTS.get(clean_node(wxr, None, link))
            if accent_tag is not None:
                pron.tags.append(accent_tag)

        # Skip list items that contributed nothing at all.
        if pron.model_dump(exclude_defaults=True):
            sounds.append(pron)

    for key in ("a", "audio"):
        file_name = clean_node(
            wxr, None, template_node.template_parameters.get(key, "")
        )
        if file_name:
            recording = Sound()
            set_sound_file_url_fields(wxr, file_name, recording)
            sounds.append(recording)
# First argument of the ja-accent-common template (as read by
# process_ja_accent_common_template) -> tag appended to the resulting Sound.
JA_ACCENT_COMMON_TYPES: dict[str, str] = {
    "h": "Heiban",
    "a": "Atamadaka",
    "n": "Nakadaka",
    "o": "Odaka",
}
def process_ja_accent_common_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract one accent ``Sound`` from a ``ja-accent-common`` template.

    The template is expanded and re-parsed. The first link with non-empty
    text becomes a raw tag and the first ``<span>`` with non-empty text
    becomes ``form``. Template argument 1 is mapped through
    ``JA_ACCENT_COMMON_TYPES`` to a tag when it is a known key. The sound is
    appended to ``sounds`` only if ``form`` is not the empty string.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-accent-common
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    def first_text(nodes):
        """Return the first non-empty cleaned text among *nodes*, or None."""
        for candidate in nodes:
            text = clean_node(wxr, None, candidate)
            if text:
                return text
        return None

    accent = Sound()

    link_text = first_text(expanded.find_child_recursively(NodeKind.LINK))
    if link_text is not None:
        accent.raw_tags.append(link_text)

    span_text = first_text(expanded.find_html_recursively("span"))
    if span_text is not None:
        accent.form = span_text

    type_code = clean_node(
        wxr, None, template_node.template_parameters.get(1, "")
    )
    type_tag = JA_ACCENT_COMMON_TYPES.get(type_code)
    if type_tag is not None:
        accent.tags.append(type_tag)

    if accent.form != "":
        sounds.append(accent)
def extract_zh_sounds(
    wxr: WiktextractContext, level_node: LevelNode, sounds: list[Sound]
) -> None:
    """Build one ``Sound`` per list item found under ``level_node``.

    Each list item's children are split at the first ``":"`` that occurs in
    a plain-string child: everything before it is cleaned into a single raw
    tag, everything after it into ``zh_pron``. When no such colon exists all
    children end up on the raw-tag side. ``translate_raw_tags`` is applied
    to every sound before it is appended to ``sounds``.
    """
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        label_parts = []
        pron_parts = []
        remaining = iter(item.children)
        for part in remaining:
            if isinstance(part, str) and ":" in part:
                label, _, rest = part.partition(":")
                label_parts.append(label)
                pron_parts.append(rest)
                # Everything after the separator belongs to the value.
                pron_parts.extend(remaining)
                break
            label_parts.append(part)

        zh_sound = Sound(
            zh_pron=clean_node(wxr, None, pron_parts),
            raw_tags=[clean_node(wxr, None, label_parts)],
        )
        translate_raw_tags(zh_sound)
        sounds.append(zh_sound)
def extract_homophone_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect homophones from a section's lists and attach them to entries.

    Only lists that are direct children of ``level_node`` and their direct
    list items are inspected. Each link child with non-empty text and each
    ``l`` template with a non-empty word becomes a ``Sound`` carrying one
    homophone; for ``l`` templates the sense, tags and raw tags returned by
    ``extract_l_template`` are copied as well.

    The sounds are appended to ``base_data`` and every ``page_data`` entry
    with the same ``lang_code`` when ``level_node.kind`` is
    ``NodeKind.LEVEL3``; otherwise to the last ``page_data`` entry, or to
    ``base_data`` when ``page_data`` is empty.
    """
    homophone_sounds = []
    item_children = (
        child
        for list_node in level_node.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
        for child in item.children
    )
    for child in item_children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            text = clean_node(wxr, None, child)
            if text != "":
                homophone_sounds.append(Sound(homophones=[text]))
        elif isinstance(child, TemplateNode) and child.template_name == "l":
            # Kept as a function-scope import, as in the original
            # (presumably to avoid a circular import - confirm).
            from .linkage import extract_l_template

            linked = extract_l_template(wxr, child)
            if linked.word != "":
                homophone_sounds.append(
                    Sound(
                        homophones=[linked.word],
                        sense=linked.sense,
                        tags=linked.tags,
                        raw_tags=linked.raw_tags,
                    )
                )

    if level_node.kind == NodeKind.LEVEL3:
        receivers = [base_data]
        receivers += [
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        ]
    elif page_data:
        receivers = [page_data[-1]]
    else:
        receivers = [base_data]

    for entry in receivers:
        entry.sounds.extend(homophone_sounds)