Coverage for src / wiktextract / extractor / ms / sound.py: 57%
182 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-02 00:27 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-02 00:27 +0000
1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..share import set_sound_file_url_fields
6from .models import Hyphenation, Sound, WordEntry
7from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract sounds from a pronunciation section and attach them to entries.

    Sounds are gathered from every list item of the section's lists and from
    templates that are direct children of the section.  They are then merged
    into one or more word entries:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different ``lang_code`` than ``base_data``;
    * every entry of ``page_data`` sharing the last entry's ``lang_code``
      when the section is a level-3 heading;
    * only the last entry of ``page_data`` otherwise.

    A sound that carries hyphenations contributes only its hyphenations;
    any other sound is appended to the entry's ``sounds``.  Categories of
    every sound are copied to the entry, skipping ones already present.

    Args:
        wxr: Extraction context passed through to the sound extractors.
        page_data: Word entries collected so far for the page.
        base_data: Entry holding data shared by the current language section.
        level_node: The pronunciation section node.
    """
    sounds = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(extract_sound_list_item(wxr, list_item))
    for node in level_node.find_child(NodeKind.TEMPLATE):
        sounds.extend(extract_sound_templates(wxr, node, []))

    # Decide which entries receive the sounds.
    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        lang_code = page_data[-1].lang_code
        targets = [data for data in page_data if data.lang_code == lang_code]
    else:
        targets = [page_data[-1]]

    for target in targets:
        for sound in sounds:
            if len(sound.hyphenations) > 0:
                target.hyphenations.extend(sound.hyphenations)
            else:
                target.sounds.append(sound)
            for cat in sound.categories:
                # De-duplicate against the category list itself; every
                # target, including base_data, is checked the same way.
                if cat not in target.categories:
                    target.categories.append(cat)
def extract_sound_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Extract the sounds described by one list item, including nested lists.

    Children are visited in order.  ``a``/``accent`` templates and plain
    text ending in a colon contribute raw tags; those tags are handed to
    the sound templates that follow.  Nested lists are processed
    recursively.  Categories captured while cleaning the accent templates
    are added to every returned sound.
    """
    collected = []
    labels = []
    cat_data = {}
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name not in ("a", "accent"):
                collected.extend(extract_sound_templates(wxr, child, labels))
                continue
            label = clean_node(wxr, cat_data, child).strip("() ")
            if label:
                labels.append(label)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                collected.extend(extract_sound_list_item(wxr, sub_item))
        elif isinstance(child, str) and child.strip().endswith(":"):
            label = child.strip(": ")
            if label:
                labels.append(label)

    for item in collected:
        item.categories.extend(cat_data.get("categories", []))
    return collected
def extract_sound_templates(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Dispatch a pronunciation template to its dedicated extractor.

    The checks run in a fixed order, so ``audio-AFA``/``audio-IPA`` and the
    Korean templates are matched before the generic ``*-afa``/``*-ipa``
    suffix rule.  Templates that match nothing yield an empty list.
    """
    name = t_node.template_name
    lowered = name.lower()

    if name == "dewan":
        return extract_dewan_template(wxr, t_node)
    if name in ("audio-AFA", "audio-IPA"):
        return extract_audio_ipa_template(wxr, t_node, raw_tags)
    if lowered in ("ko-afa", "ko-ipa", "ko-pron"):
        return extract_ko_ipa_template(wxr, t_node)
    if lowered in ("afa", "ipa") or lowered.endswith(("-afa", "-ipa")):
        return extract_ipa_template(wxr, t_node, raw_tags)
    if name in ("penyempangan", "hyphenation", "hyph"):
        return extract_hyph_template(wxr, t_node)
    if name == "audio":
        return extract_audio_template(wxr, t_node)
    if name in ("rima", "rhymes", "rhyme"):
        return extract_rhyme_template(wxr, t_node)
    return []
def extract_dewan_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract the pronunciation shown by a ``dewan`` template.

    The cleaned template text, with a leading "Kamus Dewan:" removed and
    surrounding whitespace stripped, is stored in ``Sound.other`` and the
    sound is tagged "Kamus Dewan".  An empty text yields no sound.
    """
    cat_data = {}
    rendered = clean_node(wxr, cat_data, t_node)
    text = rendered.removeprefix("Kamus Dewan:").strip()
    if text == "":
        return []
    return [
        Sound(
            other=text,
            raw_tags=["Kamus Dewan"],
            categories=cat_data.get("categories", []),
        )
    ]
def extract_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract IPA transcriptions from an expanded IPA-style template.

    The template is expanded and every ``<span class="IPA">`` with
    non-empty text becomes one sound carrying ``raw_tags`` and the
    categories captured from the expansion.  ``translate_raw_tags`` is
    applied to each sound.
    """
    cat_data = {}
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    # Cleaning the whole expansion is done only to capture its categories.
    clean_node(wxr, cat_data, root)

    result = []
    ipa_spans = root.find_html("span", attr_name="class", attr_value="IPA")
    for ipa_span in ipa_spans:
        ipa_text = clean_node(wxr, None, ipa_span)
        if ipa_text == "":
            continue
        entry = Sound(
            ipa=ipa_text,
            raw_tags=raw_tags,
            categories=cat_data.get("categories", []),
        )
        translate_raw_tags(entry)
        result.append(entry)
    return result
def extract_hyph_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract hyphenations from a hyphenation template.

    The first positional parameter is read as the language code.  Each
    ``<span>`` in the expanded template whose ``lang`` attribute equals
    that code and whose text is non-empty produces one sound holding a
    single hyphenation, with the text split on "‧".
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    lang_arg = t_node.template_parameters.get(1, "")
    lang_code = clean_node(wxr, None, lang_arg)

    result = []
    for span in root.find_html("span", attr_name="lang", attr_value=lang_code):
        syllables = clean_node(wxr, None, span)
        if syllables == "":
            continue
        hyphenation = Hyphenation(parts=syllables.split("‧"))
        result.append(Sound(hyphenations=[hyphenation]))
    return result
def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract the audio file referenced by an ``audio`` template.

    The second positional parameter is read as the file name.  When it is
    non-empty, one sound is returned with the template's categories, after
    being passed with the file name to ``set_sound_file_url_fields``.
    """
    file_arg = t_node.template_parameters.get(2, "")
    filename = clean_node(wxr, None, file_arg)
    cat_data = {}
    # Clean the template itself only to capture its categories.
    clean_node(wxr, cat_data, t_node)
    if filename == "":
        return []

    audio = Sound(categories=cat_data.get("categories", []))
    set_sound_file_url_fields(wxr, filename, audio)
    return [audio]
def extract_rhyme_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract rhymes from a rhyme template.

    The template is expanded; each link that is a direct child of the
    expansion and has non-empty text becomes one sound whose ``rhymes``
    is that text.  All sounds receive the categories captured from the
    expansion.
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    cat_data = {}
    clean_node(wxr, cat_data, root)

    result = []
    for link_node in root.find_child(NodeKind.LINK):
        rhyme = Sound(categories=cat_data.get("categories", []))
        link_text = clean_node(wxr, None, link_node)
        if link_text == "":
            continue
        rhyme.rhymes = link_text
        result.append(rhyme)
    return result
def extract_audio_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Extract the audio file and IPA given by an ``audio-IPA`` style template.

    The second positional parameter is read as the audio file name and the
    third as the IPA transcription.  When the file name is non-empty, one
    sound is returned carrying the IPA, the template's categories and the
    caller-supplied ``raw_tags``.

    Args:
        wxr: Extraction context.
        t_node: The template node to read.
        raw_tags: Labels collected earlier in the same list item (for
            example from accent templates) that apply to this sound.

    Returns:
        A list with one sound, or an empty list when no file name is given.
    """
    sounds = []
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    cats = {}
    # Clean the template itself only to capture its categories.
    clean_node(wxr, cats, t_node)
    if filename != "":
        ipa = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
        # Attach the caller's labels the same way extract_ipa_template
        # does, so they are not dropped for audio-IPA templates.
        sound = Sound(
            ipa=ipa,
            raw_tags=raw_tags,
            categories=cats.get("categories", []),
        )
        translate_raw_tags(sound)
        set_sound_file_url_fields(wxr, filename, sound)
        sounds.append(sound)
    return sounds
def extract_ko_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract sounds from an expanded Korean pronunciation template.

    Three parts of the expansion are read in order:

    * ``<ul>``/``<li>`` items: items whose class contains "ko-pron__ph"
      give phonetic hangeul spellings (split on "/"); other items give IPA
      values (split on "~") tagged with the labels found in ``<i>`` tags,
      ignoring "AFA".
    * ``<table>`` rows: each non-empty ``<td>`` gives a romanization,
      tagged with the text of the row's last ``<th>`` if any.
    * the ``a`` (or ``audio``) template parameter: an audio file.

    Finally the whole expansion is cleaned with the last sound as the
    target passed to ``clean_node``.
    """
    result = []
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)

    # Part 1: pronunciation list.
    for ul_tag in root.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            if "ko-pron__ph" in li_tag.attrs.get("class", ""):
                ko_spans = li_tag.find_html(
                    "span", attr_name="lang", attr_value="ko"
                )
                for ko_span in ko_spans:
                    spellings = clean_node(wxr, None, ko_span).strip("[]")
                    for spelling in spellings.split("/"):
                        if spelling == "":
                            continue
                        result.append(
                            Sound(hangeul=spelling, tags=["phonetic"])
                        )
                continue

            labels = []
            for i_tag in li_tag.find_html("i"):
                for label in clean_node(wxr, None, i_tag).split("/"):
                    if label not in ("", "AFA"):
                        labels.append(label)
            ipa_spans = li_tag.find_html(
                "span", attr_name="class", attr_value="IPA"
            )
            for ipa_span in ipa_spans:
                for piece in clean_node(wxr, None, ipa_span).split("~"):
                    piece = piece.strip()
                    if piece == "":
                        continue
                    ipa_sound = Sound(ipa=piece, raw_tags=labels)
                    translate_raw_tags(ipa_sound)
                    result.append(ipa_sound)

    # Part 2: romanization table.
    for table_tag in root.find_html("table"):
        for row in table_tag.find_html("tr"):
            header = ""
            for th_tag in row.find_html("th"):
                header = clean_node(wxr, None, th_tag)
            for td_tag in row.find_html("td"):
                romanization = clean_node(wxr, None, td_tag)
                if romanization == "":
                    continue
                roman_sound = Sound(roman=romanization)
                if header != "":
                    roman_sound.raw_tags.append(header)
                    translate_raw_tags(roman_sound)
                result.append(roman_sound)

    # Part 3: audio file named by the "a" parameter, else "audio".
    params = t_node.template_parameters
    audio_arg = params.get("a", params.get("audio", ""))
    audio_file = clean_node(wxr, None, audio_arg)
    if audio_file != "":
        audio_sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, audio_sound)
        result.append(audio_sound)

    if result:
        clean_node(wxr, result[-1], root)
    return result