Coverage for src/wiktextract/extractor/ms/sound.py: 78%
135 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..share import set_sound_file_url_fields
6from .models import Sound, WordEntry
7from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract the sounds of a pronunciation section and store them on entries.

    Sounds are gathered from every list item of the section and from every
    template that is a direct child of the section node.  They are then
    attached to:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different ``lang_code`` than ``base_data``;
    * every entry in ``page_data`` sharing the last entry's ``lang_code``
      when the section node is of kind ``NodeKind.LEVEL3``;
    * only the last entry of ``page_data`` otherwise.

    Args:
        wxr: Extraction context, forwarded to the helper extractors.
        page_data: Entries collected so far for the page; mutated in place.
        base_data: Entry used as the target when no matching entry exists
            yet in ``page_data``; mutated in place.
        level_node: The section node whose children are scanned.
    """
    sounds = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(extract_sound_list_item(wxr, list_item))
    for node in level_node.find_child(NodeKind.TEMPLATE):
        # Templates sitting directly under the heading (outside any list)
        # yield sounds as well, so their result has to be kept.
        sounds.extend(extract_sound_templates(wxr, node, []))

    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        _add_sounds_to_entry(base_data, sounds)
    elif level_node.kind == NodeKind.LEVEL3:
        lang_code = page_data[-1].lang_code
        for data in page_data:
            if data.lang_code == lang_code:
                _add_sounds_to_entry(data, sounds)
    else:
        _add_sounds_to_entry(page_data[-1], sounds)


def _add_sounds_to_entry(entry: WordEntry, sounds: list[Sound]) -> None:
    """Merge ``sounds`` into ``entry``: hyphenation, sound list and categories."""
    for sound in sounds:
        if sound.hyphenation != "":
            # A hyphenation-only result is stored on the entry itself
            # rather than in its list of sounds.
            entry.hyphenation = sound.hyphenation
        else:
            entry.sounds.append(sound)
        for cat in sound.categories:
            # Membership must be tested against the category list, not the
            # entry object, so that duplicates are actually filtered out.
            if cat not in entry.categories:
                entry.categories.append(cat)
def extract_sound_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Return the sounds described by one list item, including nested lists.

    Children are handled in document order:

    * ``a`` / ``accent`` templates and plain strings ending with ``:``
      contribute a raw tag that is passed to the sound templates that
      follow in the same item;
    * any other template is handed to ``extract_sound_templates``;
    * nested lists are processed recursively, item by item.

    Categories gathered while cleaning the accent templates are appended to
    every returned sound.
    """
    accent_tags = []
    cats_data = {}
    found = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name not in ("a", "accent"):
                found.extend(
                    extract_sound_templates(wxr, child, accent_tags)
                )
                continue
            # Accent label: drop the surrounding parentheses and spaces.
            label = clean_node(wxr, cats_data, child).strip("() ")
            if label:
                accent_tags.append(label)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                found.extend(extract_sound_list_item(wxr, sub_item))
        elif isinstance(child, str) and child.strip().endswith(":"):
            # Free-text label such as "Foo:" placed before the templates.
            label = child.strip(": ")
            if label:
                accent_tags.append(label)

    shared_cats = cats_data.get("categories", [])
    for sound in found:
        sound.categories.extend(shared_cats)
    return found
def extract_sound_templates(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Dispatch a template node to the extractor matching its name.

    The checks run in a fixed order, so the exact names ``audio-AFA`` and
    ``audio-IPA`` are claimed by the audio+IPA extractor before the generic
    case-insensitive ``-afa`` / ``-ipa`` suffix test is reached.  Templates
    with an unrecognised name produce an empty list.
    """
    name = t_node.template_name
    lowered = name.lower()

    if name == "dewan":
        return extract_dewan_template(wxr, t_node)
    if name in ("audio-AFA", "audio-IPA"):
        return extract_audio_ipa_template(wxr, t_node, raw_tags)
    if lowered in ("afa", "ipa") or lowered.endswith(("-afa", "-ipa")):
        return extract_ipa_template(wxr, t_node, raw_tags)
    if name in ("penyempangan", "hyphenation", "hyph"):
        return extract_hyph_template(wxr, t_node)
    if name == "audio":
        return extract_audio_template(wxr, t_node)
    if name in ("rima", "rhymes", "rhyme"):
        return extract_rhyme_template(wxr, t_node)
    return []
def extract_dewan_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Turn a ``dewan`` template into a single ``Sound`` tagged "Kamus Dewan".

    The cleaned template text, minus a leading ``Kamus Dewan:`` label and
    surrounding whitespace, is stored in the sound's ``other`` field.  An
    empty list is returned when nothing is left after stripping.
    """
    cats_data = {}
    cleaned = clean_node(wxr, cats_data, t_node)
    value = cleaned.removeprefix("Kamus Dewan:").strip()
    if value == "":
        return []
    return [
        Sound(
            other=value,
            raw_tags=["Kamus Dewan"],
            categories=cats_data.get("categories", []),
        )
    ]
def extract_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Create one ``Sound`` per non-empty ``span`` of class ``IPA``.

    The template is expanded and re-parsed, then each matching ``span``
    becomes a sound carrying ``raw_tags`` and the categories collected from
    the expansion.  ``translate_raw_tags`` is applied to every sound.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    cats_data = {}
    # Result is ignored: this call is only made to fill ``cats_data``.
    clean_node(wxr, cats_data, root)

    results = []
    for ipa_span in root.find_html(
        "span", attr_name="class", attr_value="IPA"
    ):
        ipa_text = clean_node(wxr, None, ipa_span)
        if ipa_text == "":
            continue
        entry = Sound(
            ipa=ipa_text,
            raw_tags=raw_tags,
            categories=cats_data.get("categories", []),
        )
        translate_raw_tags(entry)
        results.append(entry)
    return results
def extract_hyph_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Return hyphenation-only sounds found in an expanded hyphenation template.

    The template's first positional argument is cleaned and used as the
    ``lang`` attribute value to look for; every matching ``span`` with
    non-empty text yields a ``Sound`` whose ``hyphenation`` is that text.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    span_texts = (
        clean_node(wxr, None, span)
        for span in root.find_html(
            "span", attr_name="lang", attr_value=lang_code
        )
    )
    return [Sound(hyphenation=text) for text in span_texts if text != ""]
def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Build a ``Sound`` for the file named by an ``audio`` template.

    The file name is read from the second positional argument.  When it is
    empty no sound is produced; otherwise the sound receives the template's
    categories and ``set_sound_file_url_fields`` is called on it.
    """
    file_arg = t_node.template_parameters.get(2, "")
    filename = clean_node(wxr, None, file_arg)
    cats_data = {}
    # Result is ignored: this call is only made to fill ``cats_data``.
    clean_node(wxr, cats_data, t_node)
    if filename == "":
        return []
    audio = Sound(categories=cats_data.get("categories", []))
    set_sound_file_url_fields(wxr, filename, audio)
    return [audio]
def extract_rhyme_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Return one ``Sound`` per non-empty link in an expanded rhyme template.

    Only links that are direct children of the re-parsed expansion are
    considered; each one's cleaned text is stored in ``rhymes`` together
    with the categories collected from the expansion.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    cats_data = {}
    # Result is ignored: this call is only made to fill ``cats_data``.
    clean_node(wxr, cats_data, root)

    results = []
    for link_node in root.find_child(NodeKind.LINK):
        rhyme = clean_node(wxr, None, link_node)
        if rhyme == "":
            continue
        entry = Sound(categories=cats_data.get("categories", []))
        entry.rhymes = rhyme
        results.append(entry)
    return results
def extract_audio_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Build a ``Sound`` holding both an audio file and its IPA transcription.

    The file name comes from the second positional argument and the IPA
    text from the third.  No sound is produced when the file name is empty.

    Args:
        wxr: Extraction context.
        t_node: The ``audio-AFA`` / ``audio-IPA`` template node.
        raw_tags: Accent labels seen earlier in the same list item; they are
            attached to the sound and run through ``translate_raw_tags``,
            matching what ``extract_ipa_template`` does.

    Returns:
        A list with zero or one ``Sound``.
    """
    sounds = []
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    cats = {}
    # Result is ignored: this call is only made to fill ``cats``.
    clean_node(wxr, cats, t_node)
    if filename != "":
        ipa = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
        sound = Sound(
            ipa=ipa,
            # Copy so the caller's running tag list is never shared with
            # (or altered through) this sound.
            raw_tags=list(raw_tags),
            categories=cats.get("categories", []),
        )
        set_sound_file_url_fields(wxr, filename, sound)
        translate_raw_tags(sound)
        sounds.append(sound)
    return sounds