Coverage for src / wiktextract / extractor / vi / sound.py: 27%
467 statements
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-29 01:50 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import set_sound_file_url_fields
15from .models import Hyphenation, Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Walk a pronunciation section, dispatching templates and list items."""
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            extract_sound_template(wxr, base_data, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, item)
def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch one pronunciation template to its dedicated extractor.

    Branch order matters: language-specific names like "ja-IPA" must be
    matched before the generic "-IPA" suffix fallback.
    """
    name = t_node.template_name
    if name == "vie-pron":
        extract_vie_pron_template(wxr, base_data, t_node)
    elif name in ("âm thanh-IPA", "pron-audio", "audio-for-pron"):
        extract_pron_audio_template(wxr, base_data, t_node)
    elif name in ("zh-pron", "zho-pron"):
        extract_zh_pron_template(wxr, base_data, t_node)
    elif name in ("th-pron", "tha-pron"):
        extract_th_pron_template(wxr, base_data, t_node)
    elif name in ("ja-pron", "ja-IPA", "jpn-IPA", "jpn-pron"):
        extract_ja_pron_template(wxr, base_data, t_node)
    elif name in ("âm thanh", "Audio", "Âm thanh"):
        extract_audio_template(wxr, base_data, t_node, 1)
    elif name in ("âm thanh-2", "audio"):
        extract_audio_template(wxr, base_data, t_node, 2)
    elif name.lower() in ("ko-ipa", "kor-ipa"):
        extract_ko_ipa_template(wxr, base_data, t_node)
    elif name in ("IPA", "IPA2", "IPA3", "IPA4") or name.endswith("-IPA"):
        extract_ipa_template(wxr, base_data, t_node, "IPA")
    elif name in ("enPR", "AHD"):
        extract_ipa_template(wxr, base_data, t_node, "enPR")
    elif name in ("rhymes", "rhyme"):
        extract_rhymes_template(wxr, base_data, t_node)
    elif name in ("hyphenation", "hyph"):
        extract_hyphenation_template(wxr, base_data, t_node)
    elif name in ("homophones", "homophone", "hmp"):
        extract_homophones_template(wxr, base_data, t_node)
def extract_sound_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Handle one list item of a sound section, recursing into sublists."""
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            extract_sound_template(wxr, base_data, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, sub_item)
@dataclass
class TableHeader:
    # One header cell of the "vie-pron" pronunciation table.
    text: str  # cleaned header text, later used as a raw tag
    index: int  # zero-based starting column of the header cell
    span: int  # colspan of the header cell
def extract_vie_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract IPA data from the expanded "vie-pron" template table.

    Column headers (dialect names) become raw tags on the IPA spans found
    in the data cells of the same columns.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    # A header cell in column 0 starts a fresh header row;
                    # discard the previous row's headers.
                    if col_index == 0:
                        col_headers.clear()
                    colspan = int(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        TableHeader(
                            clean_node(wxr, None, cell), col_index, colspan
                        )
                    )
                    col_index += colspan
                else:
                    colspan = int(cell.attrs.get("colspan", "1"))
                    for span_tag in cell.find_html(
                        "span", attr_name="class", attr_value="IPA"
                    ):
                        extract_vie_pron_span_tag(
                            wxr,
                            base_data,
                            span_tag,
                            col_index,
                            colspan,
                            col_headers,
                        )
                    col_index += colspan
                    # Some cells contain nested <td> tags of their own;
                    # scan those for IPA spans as well.
                    for td_tag in cell.find_html("td"):
                        colspan = int(td_tag.attrs.get("colspan", "1"))
                        for span_tag in td_tag.find_html(
                            "span", attr_name="class", attr_value="IPA"
                        ):
                            extract_vie_pron_span_tag(
                                wxr,
                                base_data,
                                span_tag,
                                col_index,
                                colspan,
                                col_headers,
                            )
                        col_index += colspan

    # Record category links produced by the template expansion.
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_vie_pron_span_tag(
    wxr: WiktextractContext,
    base_data: WordEntry,
    span_tag: HTMLNode,
    index: int,  # starting column of the cell; was mistyped as `str`
    colspan: int,
    col_headers: list[TableHeader],
):
    """Create a Sound from one IPA <span>, tagged with every header whose
    column range overlaps the cell's range [index, index + colspan)."""
    ipa = clean_node(wxr, None, span_tag)
    if ipa != "":
        sound = Sound(ipa=ipa)
        for header in col_headers:
            # Interval-overlap test between cell columns and header columns.
            if (
                index < header.index + header.span
                and index + colspan > header.index
            ):
                sound.raw_tags.append(header.text)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
def extract_pron_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio file with optional place label and IPA from the
    "âm thanh-IPA"/"pron-audio"/"audio-for-pron" templates."""
    filename = clean_node(wxr, None, t_node.template_parameters.get("file", ""))
    if not filename:
        return
    sound = Sound(
        ipa=clean_node(wxr, None, t_node.template_parameters.get("pron", ""))
    )
    set_sound_file_url_fields(wxr, filename, sound)
    place = clean_node(wxr, None, t_node.template_parameters.get("place", ""))
    if place:
        sound.raw_tags.append(place)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    start_arg: int,
):
    """Extract an audio file and its labels from the audio templates.

    https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh
    https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh-2

    `start_arg` is the positional argument holding the file name; the
    following positional argument, if present, is a label.
    """
    filename = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg, "")
    )
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    label = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg + 1, "")
    )
    if label != "":
        sound.raw_tags.append(label)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Qualifier text rendered as <span class="ib-content"> becomes raw tags.
    for span_node in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        for piece in clean_node(wxr, None, span_node).split(","):
            if piece != "":
                sound.raw_tags.append(piece)
    translate_raw_tags(sound)
    clean_node(wxr, base_data, expanded_node)  # collect categories
    base_data.sounds.append(sound)
def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    ipa_class: str,
):
    """Extract pronunciations from the generic IPA/enPR templates.

    https://vi.wiktionary.org/wiki/Bản_mẫu:IPA
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Some expansions render a list of pronunciations, others inline text;
    # wrap the non-list children in a dummy root so both take one code path.
    leftovers = []
    for child in expanded_node.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                extract_ipa_list_item(wxr, base_data, list_item, ipa_class)
        else:
            leftovers.append(child)
    if leftovers:
        root = WikiNode(NodeKind.ROOT, 0)
        root.children = leftovers
        extract_ipa_list_item(wxr, base_data, root, ipa_class)
    clean_node(wxr, base_data, expanded_node)
def extract_ipa_list_item(
    wxr: WiktextractContext,
    base_data: WordEntry,
    list_item: WikiNode,
    class_name: str,
):
    """Extract Sound entries from one pronunciation list item.

    `class_name` is the CSS class ("IPA" or "enPR") marking the
    pronunciation spans.
    """
    raw_tags = []
    # Italic text preceding the pronunciation acts as a qualifier.
    for italic_node in list_item.find_child(NodeKind.ITALIC):
        raw_tag = clean_node(wxr, None, italic_node)
        if raw_tag != "":
            raw_tags.append(raw_tag)
    for span_tag in list_item.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in span_class or "label-content" in span_class:
            for raw_tag in clean_node(wxr, None, span_tag).split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    raw_tags.append(raw_tag)
        elif class_name in span_class:
            # NOTE(review): every Sound shares the same `raw_tags` list
            # object here; this is fine only if Sound copies the list on
            # construction — confirm against the model definition.
            sound = Sound(
                ipa=clean_node(wxr, None, span_tag), raw_tags=raw_tags
            )
            if sound.ipa != "":
                translate_raw_tags(sound)
                base_data.sounds.append(sound)
def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract rhyme values from the "rhymes" template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:rhymes
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="IPA"
    ):
        rhyme_str = clean_node(wxr, None, span_tag)
        if rhyme_str != "":
            base_data.sounds.append(Sound(rhymes=rhyme_str))
    # Record category links added by the template.
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract hyphenation parts from the "hyphenation" template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:hyphenation
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        # The parts are joined with "‧" (U+2027 hyphenation point).
        parts = [
            part.strip()
            for part in clean_node(wxr, None, span_tag).split("‧")
            if part.strip() != ""
        ]
        if len(parts) > 0:
            base_data.hyphenations.append(Hyphenation(parts=parts))
def extract_homophone_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Extract homophones from plain link lists in a homophone section."""
    # Fix: the loop variable was named "list", shadowing the builtin.
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for link_node in list_item.find_child(NodeKind.LINK):
                homophone = clean_node(wxr, None, link_node)
                if homophone != "":
                    base_data.sounds.append(Sound(homophone=homophone))
def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract pronunciations from the Chinese "zh-pron" template."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Nested lists are walked recursively by extract_zh_pron_list_item;
    # remember which ones were already visited so the recursive search
    # here does not process them a second time.
    visited = set()
    collected = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in visited:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], visited)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded_node)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract Sound data from one zh-pron list item.

    `raw_tags` are labels inherited from parent list items; nested lists
    visited here are added to `seen_lists` so the caller's recursive scan
    skips them.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith(("File:", "Tập tin:")):
                    # Audio file link.
                    filename = link_str.removeprefix("File:").removeprefix(
                        "Tập tin:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # Other links act as labels (parentheses stripped).
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove "ghi chú"(help) <sup> tag
                    if is_first_small_tag:
                        # Leading <small> text holds this item's labels.
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Later <small> text qualifies the last sound.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "Đồng âm"
                ):
                    # "Đồng âm" = homophones table.
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron label string into individual raw tags.

    Text without parentheses is split on ",", ":", ";" and " và "
    (Vietnamese "and"); otherwise each parenthesized group is split
    recursively and the remaining text is processed the same way.
    """
    if "(" not in raw_tag_text:
        tags = []
        for piece in re.split(r",|:|;| và ", raw_tag_text):
            piece = piece.strip().removeprefix("bao gồm").strip()
            if piece:
                tags.append(piece)
        return tags
    tags = []
    spans = []
    for m in re.finditer(r"\([^()]+\)", raw_tag_text):
        spans.append((m.start(), m.end()))
        tags.extend(
            split_zh_pron_raw_tag(raw_tag_text[m.start() + 1 : m.end() - 1])
        )
    # Collect the text outside all parenthesized groups.
    remainder = ""
    prev_end = 0
    for start, end in spans:
        remainder += raw_tag_text[prev_end:start]
        prev_end = end
    remainder += raw_tag_text[prev_end:]
    if remainder != raw_tag_text:
        return split_zh_pron_raw_tag(remainder) + tags
    # Nothing matched (e.g. unbalanced parenthesis): keep the text as-is.
    tags.append(remainder)
    return tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract Sound data from one zh-pron pronunciation <span>.

    The span may mix the pronunciation text with a <small> qualifier, a
    nested romanization span (lang "*-Latn") and a trailing
    "[Phonetic: …]" variant.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            # Everything after this marker is the phonetic pronunciation.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # IPA-classed spans populate `ipa`, others `zh_pron`.
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # The <small> qualifier applies to the last pronunciation only.
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on delimiters outside parentheses.

    Delimiters are ",", ";", "→" and "/" — the latter only when the whole
    string does not start with "/" (i.e. is not a single /IPA/ notation).
    Returns the stripped, non-empty pieces.

    Fix: the trailing piece is now stripped like all the others (it was
    previously appended with surrounding whitespace intact).
    """
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    slash_splits = not zh_pron.startswith("/")
    for c in zh_pron:
        if (
            (c in (",", ";", "→") or (c == "/" and slash_splits))
            and depth == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        else:
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
            pron += c
    pron = pron.strip()
    if pron != "":
        pron_list.append(pron)
    return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract homophone words from the zh-pron homophones table."""
    script_tag = {"Hant": "Traditional-Chinese", "Hans": "Simplified-Chinese"}
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            word = clean_node(wxr, None, span_tag)
            # Only spans carrying a lang attribute and a Chinese script
            # class hold actual words; "/" is a visual separator.
            if (
                word in ("", "/")
                or span_lang == ""
                or span_class not in ("Hant", "Hans", "Hani")
            ):
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            if span_class in script_tag:
                sound.tags.append(script_tag[span_class])
            sounds.append(sound)
    return sounds
def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract pronunciation data from the Thai "th-pron" template.

    The template renders a table whose row headers carry labels and whose
    data cells hold spellings, IPA, romanizations, homophones and audio.
    """
    # https://vi.wiktionary.org/wiki/Bản mẫu:th-pron
    @dataclass
    class TableHeader:
        raw_tags: list[str]  # labels from the header cell
        rowspan: int  # remaining rows the header still covers

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Age out headers whose rowspan is exhausted; keep the rest
            # for one more row.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # Section headers decide which Sound field the data cells
                # of this row populate.
                if header_str.startswith("(Tiêu chuẩn) IPA"):
                    field = "ipa"
                elif header_str.startswith("Từ đồng âm"):
                    field = "homophone"
                elif header_str == "Âm thanh":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            # "[...]" <small> text lists qualifiers; other
                            # <small> text is a romanization of the last
                            # extracted sound.
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # NOTE(review): reassigning `field` makes
                                # every later span in this row use "roman"
                                # too — confirm this is intended.
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)

    base_data.sounds.extend(sounds)
    clean_node(wxr, base_data, expanded_node)
def extract_homophones_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract homophones from the "homophones" template.

    The expansion wraps everything in <span class="homophones">; inner
    spans hold the word (matching lang attribute), its romanization
    (class "tr") or a qualifier label.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    homophones = []
    for top_span in expanded_node.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for span_tag in top_span.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            classes = span_tag.attrs.get("class", "").split()
            if "tr" in classes and homophones:
                # Romanization belongs to the preceding word.
                homophones[-1].roman = clean_node(wxr, None, span_tag)
            elif span_lang == lang_code:
                word = clean_node(wxr, None, span_tag)
                if word != "":
                    homophones.append(Sound(homophone=word))
            elif "qualifier-content" in classes and homophones:
                raw_tag = clean_node(wxr, None, span_tag)
                if raw_tag != "":
                    homophones[-1].raw_tags.append(raw_tag)
                    translate_raw_tags(homophones[-1])
    base_data.sounds.extend(homophones)
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)
def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract pronunciation data from the Japanese pron/IPA templates.

    Each rendered <li> holds an accent label, IPA, romanization and kana;
    an "a"/"audio" template argument names an optional audio file.
    """
    JA_PRON_ACCENTS = {"Nakadaka", "Heiban", "Atamadaka", "Odaka"}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_class:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        # Pitch-accent pattern names appear as plain links.
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(link_text)
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            base_data.sounds.append(sound)

    # Fix: the template argument may be a wikitext node rather than a plain
    # string, so clean it instead of calling str.strip() on it directly
    # (consistent with extract_ko_ipa_template).
    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        base_data.sounds.append(sound)

    clean_node(wxr, base_data, expanded_node)
def extract_ko_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract pronunciation data from the Korean IPA template.

    The expansion contains an <ul> with phonetic hangeul and IPA items, a
    romanization <table>, and optionally an audio file argument.
    """
    sounds = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, word_entry, expanded_node)  # collect categories
    for ul_node in expanded_node.find_html("ul"):
        for li_node in ul_node.find_html("li"):
            if "ko-pron__ph" in li_node.attrs.get("class", ""):
                # Phonetic hangeul, rendered like "[가/나]".
                for span_node in li_node.find_html(
                    "span", attr_name="lang", attr_value="ko"
                ):
                    hangeul_str = clean_node(wxr, None, span_node).strip("[]")
                    for hangeul in hangeul_str.split("/"):
                        if hangeul != "":
                            sounds.append(
                                Sound(hangeul=hangeul, tags=["phonetic"])
                            )
            else:
                # IPA items; the <i> element holds qualifier labels.
                raw_tags = []
                for i_node in li_node.find_html("i"):
                    for raw_tag in clean_node(wxr, None, i_node).split("/"):
                        if raw_tag not in ["", "IPA"]:
                            raw_tags.append(raw_tag)
                for span_node in li_node.find_html(
                    "span", attr_name="class", attr_value="IPA"
                ):
                    ipas = clean_node(wxr, None, span_node)
                    # "~" separates alternative pronunciations.
                    for ipa in ipas.split("~"):
                        ipa = ipa.strip()
                        if ipa != "":
                            sound = Sound(ipa=ipa, raw_tags=raw_tags)
                            translate_raw_tags(sound)
                            sounds.append(sound)
    # Romanization table: <th> names the system, <td> holds the value.
    for table in expanded_node.find_html("table"):
        for tr in table.find_html("tr"):
            raw_tag = ""
            for th in tr.find_html("th"):
                raw_tag = clean_node(wxr, None, th)
            for td in tr.find_html("td"):
                roman = clean_node(wxr, None, td)
                if roman != "":
                    sound = Sound(roman=roman)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                    translate_raw_tags(sound)
                    sounds.append(sound)

    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)
    word_entry.sounds.extend(sounds)