Coverage for src/wiktextract/extractor/vi/sound.py: 36%
355 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import set_sound_file_url_fields
15from .models import Hyphenation, Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Dispatch every child of a pronunciation section to its handler.

    Known pronunciation templates are handled directly; plain wiki lists
    are walked item by item via ``extract_sound_list_item``.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name == "vie-pron":
                extract_vie_pron_template(wxr, base_data, child)
            elif name in ("âm thanh-IPA", "pron-audio", "audio-for-pron"):
                extract_pron_audio_template(wxr, base_data, child)
            elif name == "tyz-IPA":
                extract_tyz_ipa_template(wxr, base_data, child)
            elif name in ("zh-pron", "zho-pron"):
                extract_zh_pron_template(wxr, base_data, child)
            elif name in ("th-pron", "tha-pron"):
                extract_th_pron_template(wxr, base_data, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, item)
def extract_sound_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Handle one pronunciation list item, recursing into nested lists."""
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in ("âm thanh", "Audio", "Âm thanh"):
                extract_audio_template(wxr, base_data, child, 1)
            elif name in ("âm thanh-2", "audio"):
                extract_audio_template(wxr, base_data, child, 2)
            elif name in ("IPA", "IPA2", "IPA3", "IPA4", "fra-IPA", "fr-IPA"):
                extract_ipa_template(wxr, base_data, child, "IPA")
            elif name in ("enPR", "AHD"):
                extract_ipa_template(wxr, base_data, child, "enPR")
            elif name in ("rhymes", "rhyme"):
                extract_rhymes_template(wxr, base_data, child)
            elif name in ("hyphenation", "hyph"):
                extract_hyphenation_template(wxr, base_data, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, nested_item)
@dataclass
class TableHeader:
    # Plain-text content of a table header cell.
    text: str
    # Zero-based column index where the header starts.
    index: int
    # Number of columns the header covers ("colspan").
    span: int
def extract_vie_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract IPA sounds from the expanded "vie-pron" table template.

    Header cells are collected into ``col_headers`` and later serve as
    raw tags for the IPA spans found in data cells of the same columns.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    # A header cell in the first column starts a fresh
                    # header row, replacing the previous headers.
                    if col_index == 0:
                        col_headers.clear()
                    colspan = int(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        TableHeader(
                            clean_node(wxr, None, cell), col_index, colspan
                        )
                    )
                    col_index += colspan
                else:
                    colspan = int(cell.attrs.get("colspan", "1"))
                    # IPA spans directly inside the data cell.
                    for span_tag in cell.find_html(
                        "span", attr_name="class", attr_value="IPA"
                    ):
                        extract_vie_pron_span_tag(
                            wxr,
                            base_data,
                            span_tag,
                            col_index,
                            colspan,
                            col_headers,
                        )
                    col_index += colspan
                    # Some cells hold nested ``td`` tags; scan those for
                    # IPA spans too, advancing the column index per td.
                    for td_tag in cell.find_html("td"):
                        colspan = int(td_tag.attrs.get("colspan", "1"))
                        for span_tag in td_tag.find_html(
                            "span", attr_name="class", attr_value="IPA"
                        ):
                            extract_vie_pron_span_tag(
                                wxr,
                                base_data,
                                span_tag,
                                col_index,
                                colspan,
                                col_headers,
                            )
                        col_index += colspan

    # Top-level links (e.g. categories) are recorded on base_data.
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_vie_pron_span_tag(
    wxr: WiktextractContext,
    base_data: WordEntry,
    span_tag: HTMLNode,
    index: int,  # annotation fixed: was ``str`` but used arithmetically
    colspan: int,
    col_headers: list[TableHeader],
):
    """Create a Sound from one IPA span, tagged with overlapping headers.

    A header applies when its column range
    [header.index, header.index + header.span) overlaps the cell range
    [index, index + colspan).
    """
    ipa = clean_node(wxr, None, span_tag)
    if ipa != "":
        sound = Sound(ipa=ipa)
        for header in col_headers:
            if (
                index < header.index + header.span
                and index + colspan > header.index
            ):
                sound.raw_tags.append(header.text)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
def extract_pron_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio file with optional place tag and IPA.

    Reads the "file", "place" and "pron" template parameters; does
    nothing when no file name is given.
    """
    filename = clean_node(
        wxr, None, t_node.template_parameters.get("file", "")
    )
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    place_str = clean_node(
        wxr, None, t_node.template_parameters.get("place", "")
    )
    if place_str != "":
        sound.raw_tags.append(place_str)
    sound.ipa = clean_node(
        wxr, None, t_node.template_parameters.get("pron", "")
    )
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    start_arg: int,
):
    """Extract an audio file plus an optional raw tag.

    ``start_arg`` is the positional parameter holding the file name; the
    following positional parameter, if any, is a dialect/label tag.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh
    # https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh-2
    filename = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg, "")
    )
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    tag_str = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg + 1, "")
    )
    if tag_str != "":
        sound.raw_tags.append(tag_str)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_tyz_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Tày IPA values from the expanded "tyz-IPA" template.

    Each list item holds optional italic dialect labels and a
    ``span`` element with class "IPA" containing the transcription.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Renamed loop variable: the original shadowed the builtin ``list``.
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sound = Sound()
            for node in list_item.children:
                if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
                    # Italic text is a dialect/register label.
                    raw_tag = clean_node(wxr, None, node)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                elif (
                    isinstance(node, HTMLNode)
                    and node.tag == "span"
                    and "IPA" in node.attrs.get("class", "").split()
                ):
                    sound.ipa = clean_node(wxr, None, node)
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    # Links (e.g. categories) are recorded on base_data.
                    clean_node(wxr, base_data, node)
            if sound.ipa != "":
                base_data.sounds.append(sound)
def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    ipa_class: str,
):
    """Extract pronunciations from an expanded IPA-family template.

    ``ipa_class`` is the CSS class ("IPA" or "enPR") marking
    pronunciation spans; "qualifier-content" spans encountered earlier
    accumulate as raw tags for the following pronunciations.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:IPA
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    raw_tags = []
    for span_tag in expanded_node.find_html("span"):
        class_names = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in class_names:
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                raw_tags.append(raw_tag)
        elif ipa_class in class_names:
            ipa = clean_node(wxr, None, span_tag)
            if ipa != "":
                # Pass a copy: the original handed the shared ``raw_tags``
                # list to every Sound, so a later mutation of one sound's
                # raw_tags could leak into its siblings (matches the
                # ``raw_tags[:]`` discipline used elsewhere in this file).
                sound = Sound(ipa=ipa, raw_tags=list(raw_tags))
                translate_raw_tags(sound)
                base_data.sounds.append(sound)

    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract rhyme strings from an expanded "rhymes" template."""
    # https://vi.wiktionary.org/wiki/Bản_mẫu:rhymes
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for ipa_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="IPA"
    ):
        rhyme_str = clean_node(wxr, None, ipa_span)
        if len(rhyme_str) > 0:
            base_data.sounds.append(Sound(rhymes=rhyme_str))

    # Category links go onto base_data.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)
def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract syllable breaks from an expanded "hyphenation" template."""
    # https://vi.wiktionary.org/wiki/Bản_mẫu:hyphenation
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_node in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        # The span text uses "‧" as the syllable separator.
        pieces = [
            piece.strip()
            for piece in clean_node(wxr, None, span_node).split("‧")
            if piece.strip() != ""
        ]
        if len(pieces) > 0:
            h_data = Hyphenation()
            h_data.parts.extend(pieces)
            base_data.hyphenations.append(h_data)
def extract_homophone_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Collect homophones from wiki links inside the section's lists."""
    # Renamed loop variable: the original shadowed the builtin ``list``.
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for link_node in list_item.find_child(NodeKind.LINK):
                homophone = clean_node(wxr, None, link_node)
                if homophone != "":
                    base_data.sounds.append(Sound(homophone=homophone))
def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse an expanded "zh-pron" template and append its sounds.

    Nested lists visited during recursion are tracked in ``visited`` so
    the outer traversal does not process them a second time.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    visited = set()
    collected = []
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        if list_node in visited:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], visited)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract sounds from one "zh-pron" list item.

    ``raw_tags`` are labels inherited from ancestor list items; they are
    copied so sibling items do not share one tag list. Nested lists are
    added to ``seen_lists`` so the top-level caller skips them.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith(("File:", "Tập tin:")):
                    # Audio file link.
                    filename = link_str.removeprefix("File:").removeprefix(
                        "Tập tin:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # Other link text becomes a tag, parentheses stripped.
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # Drop the "ghi chú" (help) <sup> tag before reading
                    # the label text.
                    if is_first_small_tag:
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Later <small> text qualifies the previous sound.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "Đồng âm"
                ):
                    # A table right after a "Đồng âm" (homophone) label.
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron qualifier string into individual raw tags.

    Parenthesized groups are split recursively; the remaining text is
    split on commas, colons, semicolons and the Vietnamese word " và ",
    with a leading "bao gồm" ("including") removed from each piece.
    """
    if "(" not in raw_tag_text:
        tags = []
        for piece in re.split(r",|:|;| và ", raw_tag_text):
            piece = piece.strip().removeprefix("bao gồm").strip()
            if piece:
                tags.append(piece)
        return tags

    tags = []
    matched_spans = []
    for m in re.finditer(r"\([^()]+\)", raw_tag_text):
        matched_spans.append((m.start(), m.end()))
        inner = raw_tag_text[m.start() + 1 : m.end() - 1]
        tags.extend(split_zh_pron_raw_tag(inner))

    # Stitch together everything outside the matched parentheses.
    remainder = ""
    prev_end = 0
    for begin, finish in matched_spans:
        remainder += raw_tag_text[prev_end:begin]
        prev_end = finish
    remainder += raw_tag_text[prev_end:]

    if remainder != raw_tag_text:
        return split_zh_pron_raw_tag(remainder) + tags
    # No balanced group matched (e.g. unbalanced "("): keep as one tag.
    tags.append(remainder)
    return tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract Sound objects from one pronunciation ``span`` of "zh-pron".

    Spans with the "IPA" class yield ``ipa`` sounds, others ``zh_pron``
    sounds; a trailing "[Phonetic: ...]" run becomes an extra sound
    tagged "Phonetic". A ``span[lang$=-Latn]`` child supplies the
    romanization; ``small`` children supply extra raw tags for the last
    sound.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            # Latin-script sub-span holds the romanization.
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # Pass copies of ``raw_tags``: the original gave every Sound
            # (and the caller) the same list object, so the
            # ``raw_tags.extend(small_tags)`` below could leak tags into
            # sibling sounds and back into the caller's tag list.
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=list(raw_tags))
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=list(raw_tags))
                )
    if len(sounds) > 0:
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→" and "/" — the slash only when the whole
    string does not start with "/" (i.e. it is not a /slash-delimited/
    IPA notation). Separator characters inside parentheses, or appearing
    while the current chunk is still empty, are kept literally.
    """
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    slash_splits = not zh_pron.startswith("/")
    for c in zh_pron:
        if (
            (c in (",", ";", "→") or (c == "/" and slash_splits))
            and depth == 0
            and pron.strip() != ""
        ):
            pron_list.append(pron.strip())
            pron = ""
        else:
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
            pron += c
    # Fix: strip the final chunk like the ones appended in the loop
    # (the original appended it with surrounding whitespace intact).
    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract homophones from the "Đồng âm" table of "zh-pron".

    Only spans with a ``lang`` attribute and a Han-script class are
    kept; "Hant"/"Hans" classes map to traditional/simplified tags.
    """
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            span_str = clean_node(wxr, None, span_tag)
            if (
                span_str not in ["", "/"]
                and span_lang != ""
                and span_class in ["Hant", "Hans", "Hani"]
            ):
                # Copy ``raw_tags`` so the sounds don't all alias (and
                # can't mutate) the caller's list.
                sound = Sound(homophone=span_str, raw_tags=list(raw_tags))
                if span_class == "Hant":
                    sound.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    sound.tags.append("Simplified-Chinese")
                sounds.append(sound)
    return sounds
def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract IPA, romanization, homophones and audio from "th-pron".

    The expanded template renders as an HTML table; the header of each
    row selects which ``Sound`` field the row's cells populate.
    """
    # https://vi.wiktionary.org/wiki/Bản mẫu:th-pron
    @dataclass
    class TableHeader:
        # Header label, applied as a raw tag to the row's sounds.
        text: str
        # Remaining number of rows this header still covers ("rowspan").
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Keep only headers whose rowspan still reaches this row.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # "(Tiêu chuẩn)" = standard; "Từ đồng âm" = homophones;
                # "Âm thanh" = audio.
                if header_str.startswith("(Tiêu chuẩn) IPA"):
                    field = "ipa"
                elif header_str.startswith("Từ đồng âm"):
                    field = "homophone"
                elif header_str == "Âm thanh":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    row_headers.append(TableHeader(header_str, rowspan))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    # Audio rows contain file links.
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tag = ""
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            # "[...]" small text is a qualifier tag; any
                            # other small text is the romanization of
                            # the previous sound.
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                raw_tag = node_str.strip("[]")
                            elif len(sounds) > 0:
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                # NOTE(review): ``field`` may still be
                                # "other" here — confirm Sound accepts an
                                # "other" attribute.
                                sound = Sound()
                                setattr(sound, field, node_str)
                                if raw_tag != "":
                                    sound.raw_tags.append(raw_tag)
                                for header in row_headers:
                                    sound.raw_tags.append(header.text)
                                translate_raw_tags(sound)
                                sounds.append(sound)

    base_data.sounds.extend(sounds)
    clean_node(wxr, base_data, expanded_node)