Coverage for src/wiktextract/extractor/vi/sound.py: 81%
149 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from dataclasses import dataclass
3from wikitextprocessor import (
4 HTMLNode,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from ..share import set_sound_file_url_fields
14from .models import Hyphenation, Sound, WordEntry
15from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Walk the direct children of a pronunciation section.

    Template children are dispatched on their template name to the matching
    template extractor; templates with any other name are ignored.  Every
    item of a list child is passed to `extract_sound_list_item`.  Results
    are written into `base_data` by the called extractors.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name == "vie-pron":
                extract_vie_pron_template(wxr, base_data, child)
            elif name in ("âm thanh-IPA", "pron-audio", "audio-for-pron"):
                extract_pron_audio_template(wxr, base_data, child)
            elif name == "tyz-IPA":
                extract_tyz_ipa_template(wxr, base_data, child)
            # A template node is never looked at as a list.
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, item)
def extract_sound_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Extract pronunciation data from one list item of a sound section.

    Template children are dispatched on their name to the audio, IPA/enPR,
    rhymes or hyphenation extractor; other templates are ignored.  Nested
    lists are processed recursively, item by item.  Results are written
    into `base_data` by the called extractors.
    """
    import unicodedata

    for node in list_item.children:
        if isinstance(node, TemplateNode):
            name = node.template_name
            # The audio template name is matched case-insensitively.  It is
            # also NFC-normalized so that a name written with a combining
            # circumflex ("a" + U+0302) matches the precomposed "â" (U+00E2).
            audio_name = unicodedata.normalize("NFC", name.lower())
            if audio_name in ("\u00e2m thanh", "audio"):
                extract_audio_template(wxr, base_data, node)
            elif name in [
                "IPA",
                "IPA2",
                "IPA3",
                "IPA4",
                "fra-IPA",
                "fr-IPA",
            ]:
                extract_ipa_template(wxr, base_data, node, "IPA")
            elif name in ["enPR", "AHD"]:
                extract_ipa_template(wxr, base_data, node, "enPR")
            elif name in ["rhymes", "rhyme"]:
                extract_rhymes_template(wxr, base_data, node)
            elif name in ["hyphenation", "hyph"]:
                extract_hyphenation_template(wxr, base_data, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, child_list_item)
@dataclass
class TableHeader:
    """One header cell of a pronunciation table, with its column extent."""

    # Cleaned text of the header cell.
    text: str
    # Index of the first column the header covers.
    index: int
    # Number of columns the header covers (its colspan).
    span: int
def _extract_vie_pron_cell_spans(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell_node: "WikiNode | HTMLNode",
    col_index: int,
    col_headers: list[TableHeader],
) -> int:
    """Extract the IPA spans found directly in one cell.

    `cell_node` is either a table cell node or an HTML `td` node.  Every
    `span` child with class "IPA" is passed to `extract_vie_pron_span_tag`
    together with the column range of the cell.

    Returns the index of the column that follows this cell.
    """
    colspan = int(cell_node.attrs.get("colspan", "1"))
    for span_tag in cell_node.find_html(
        "span", attr_name="class", attr_value="IPA"
    ):
        extract_vie_pron_span_tag(
            wxr, base_data, span_tag, col_index, colspan, col_headers
        )
    # Advance once per cell, so that the alignment with the header columns
    # does not depend on how many IPA spans the cell contains.
    return col_index + colspan


def extract_vie_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract IPA pronunciations from an expanded `vie-pron` template.

    The template is expanded and each table in the result is read row by
    row.  Header cells are recorded with their column range; IPA spans in
    data cells are then stored as sounds tagged with the overlapping
    headers (see `extract_vie_pron_span_tag`).
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers: list[TableHeader] = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if col_index == 0:
                        # First header of a row: a new header row starts,
                        # so the headers of the previous one are dropped.
                        col_headers.clear()
                    colspan = int(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        TableHeader(
                            clean_node(wxr, None, cell), col_index, colspan
                        )
                    )
                    col_index += colspan
                else:
                    col_index = _extract_vie_pron_cell_spans(
                        wxr, base_data, cell, col_index, col_headers
                    )
                    # Further columns may appear as HTML `td` tags that are
                    # children of the cell node; each one is its own column.
                    for td_tag in cell.find_html("td"):
                        col_index = _extract_vie_pron_cell_spans(
                            wxr, base_data, td_tag, col_index, col_headers
                        )

    # Top-level links are cleaned with `base_data` as the target
    # (presumably to record category links - confirm in `clean_node`).
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_vie_pron_span_tag(
    wxr: WiktextractContext,
    base_data: WordEntry,
    span_tag: HTMLNode,
    index: int,
    colspan: int,
    col_headers: list[TableHeader],
):
    """Store the IPA text of one span as a sound tagged with its headers.

    `index` is the first column of the cell holding the span and `colspan`
    the number of columns the cell covers.  The text of every header whose
    column range overlaps that range is added as a raw tag.  Nothing is
    stored when the cleaned span text is empty.
    """
    ipa = clean_node(wxr, None, span_tag)
    if ipa == "":
        return

    sound = Sound(ipa=ipa)
    cell_end = index + colspan
    for header in col_headers:
        # Half-open ranges [index, cell_end) and
        # [header.index, header.index + header.span) overlap.
        if index < header.index + header.span and cell_end > header.index:
            sound.raw_tags.append(header.text)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_pron_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Build a sound entry from the named arguments of an audio template.

    Reads the `file`, `place` and `pron` arguments.  Nothing is stored when
    `file` is empty.  A non-empty `place` becomes a raw tag and `pron` is
    stored as the IPA value (possibly an empty string).
    """
    params = t_node.template_parameters
    file_name = clean_node(wxr, None, params.get("file", ""))
    if file_name == "":
        return

    sound = Sound()
    set_sound_file_url_fields(wxr, file_name, sound)

    place_text = clean_node(wxr, None, params.get("place", ""))
    if place_text != "":
        sound.raw_tags.append(place_text)

    sound.ipa = clean_node(wxr, None, params.get("pron", ""))
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Build a sound entry from the positional arguments of an audio template.

    Argument 2 is the audio file and argument 3 an optional raw tag.
    Nothing is stored when the file argument is empty.
    """
    params = t_node.template_parameters
    file_name = clean_node(wxr, None, params.get(2, ""))
    if file_name == "":
        return

    sound = Sound()
    set_sound_file_url_fields(wxr, file_name, sound)

    tag_text = clean_node(wxr, None, params.get(3, ""))
    if tag_text != "":
        sound.raw_tags.append(tag_text)

    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_tyz_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract IPA pronunciations from an expanded `tyz-IPA` template.

    Each list item of the expansion yields at most one sound: italic
    children become raw tags, a `span` child with class "IPA" supplies the
    IPA text, and link children are cleaned with `base_data` as the target.
    Items that end up without IPA text are skipped.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sound = Sound()
            for node in list_item.children:
                if not isinstance(node, WikiNode):
                    continue
                if node.kind == NodeKind.ITALIC:
                    raw_tag = clean_node(wxr, None, node)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                elif (
                    isinstance(node, HTMLNode)
                    and node.tag == "span"
                    and "IPA" in node.attrs.get("class", "").split()
                ):
                    sound.ipa = clean_node(wxr, None, node)
                elif node.kind == NodeKind.LINK:
                    clean_node(wxr, base_data, node)
            if sound.ipa != "":
                base_data.sounds.append(sound)
def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    ipa_class: str,
):
    """Extract pronunciations from an expanded IPA-like template.

    See https://vi.wiktionary.org/wiki/Bản_mẫu:IPA

    Top-level `span` tags of the expansion are read in order.  Spans with
    class "qualifier-content" add their text to a list of raw tags; spans
    whose classes include `ipa_class` create a sound that is given the
    raw tags collected so far.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    qualifiers = []
    for span in expanded.find_html("span"):
        classes = span.attrs.get("class", "").split()
        if "qualifier-content" in classes:
            qualifier = clean_node(wxr, None, span)
            if qualifier != "":
                qualifiers.append(qualifier)
            continue
        if ipa_class not in classes:
            continue
        ipa = clean_node(wxr, None, span)
        if ipa == "":
            continue
        sound = Sound(ipa=ipa, raw_tags=qualifiers)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)

    # Top-level links are cleaned with `base_data` as the target
    # (presumably to record category links - confirm in `clean_node`).
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract rhymes from an expanded rhymes template.

    See https://vi.wiktionary.org/wiki/Bản_mẫu:rhymes

    Every `span` with class "IPA", at any depth of the expansion, adds one
    sound whose `rhymes` field is the cleaned span text; empty texts are
    skipped.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    ipa_spans = expanded.find_html_recursively(
        "span", attr_name="class", attr_value="IPA"
    )
    for span in ipa_spans:
        rhyme_text = clean_node(wxr, None, span)
        if rhyme_text == "":
            continue
        base_data.sounds.append(Sound(rhymes=rhyme_text))

    # Top-level links are cleaned with `base_data` as the target
    # (presumably to record category links - confirm in `clean_node`).
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract hyphenations from an expanded hyphenation template.

    See https://vi.wiktionary.org/wiki/Bản_mẫu:hyphenation

    Top-level `span` tags whose `lang` attribute equals the first template
    argument are split at "‧"; the stripped, non-empty pieces form one
    hyphenation entry.  Spans that yield no pieces are skipped.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_spans = expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    )
    for span in lang_spans:
        text = clean_node(wxr, None, span)
        hyphenation = Hyphenation()
        stripped_pieces = (piece.strip() for piece in text.split("‧"))
        hyphenation.parts.extend(p for p in stripped_pieces if p != "")
        if hyphenation.parts:
            base_data.hyphenations.append(hyphenation)