Coverage for src / wiktextract / extractor / vi / sound.py: 30%
415 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import set_sound_file_url_fields
15from .models import Hyphenation, Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Harvest pronunciation data from a sound section.

    Templates that are direct children of the section and templates
    nested inside list items are both routed to the template dispatcher.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            extract_sound_template(wxr, base_data, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, item)
def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch a single pronunciation template to its extractor.

    The order of the checks matters: language-specific IPA templates
    (e.g. "ja-IPA") must be matched before the generic "-IPA" suffix.
    """
    name = t_node.template_name
    if name == "vie-pron":
        extract_vie_pron_template(wxr, base_data, t_node)
    elif name in ["âm thanh-IPA", "pron-audio", "audio-for-pron"]:
        extract_pron_audio_template(wxr, base_data, t_node)
    elif name in ["zh-pron", "zho-pron"]:
        extract_zh_pron_template(wxr, base_data, t_node)
    elif name in ["th-pron", "tha-pron"]:
        extract_th_pron_template(wxr, base_data, t_node)
    elif name in ["ja-pron", "ja-IPA", "jpn-IPA", "jpn-pron"]:
        extract_ja_pron_template(wxr, base_data, t_node)
    elif name in ["âm thanh", "Audio", "Âm thanh"]:
        # First positional argument is the audio file name.
        extract_audio_template(wxr, base_data, t_node, 1)
    elif name in ["âm thanh-2", "audio"]:
        # Second positional argument is the audio file name.
        extract_audio_template(wxr, base_data, t_node, 2)
    elif name in ["IPA", "IPA2", "IPA3", "IPA4"] or name.endswith("-IPA"):
        extract_ipa_template(wxr, base_data, t_node, "IPA")
    elif name in ["enPR", "AHD"]:
        extract_ipa_template(wxr, base_data, t_node, "enPR")
    elif name in ["rhymes", "rhyme"]:
        extract_rhymes_template(wxr, base_data, t_node)
    elif name in ["hyphenation", "hyph"]:
        extract_hyphenation_template(wxr, base_data, t_node)
    elif name in ["homophones", "homophone", "hmp"]:
        extract_homophones_template(wxr, base_data, t_node)
def extract_sound_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Process one sound-section list item: its templates and any nested lists."""
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            extract_sound_template(wxr, base_data, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Recurse into sub-lists (e.g. dialect sub-entries).
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, nested_item)
@dataclass
class TableHeader:
    """A table column header used when parsing the "vie-pron" table.

    ``index`` is the zero-based start column and ``span`` the colspan,
    so the header covers columns [index, index + span).
    """

    text: str
    index: int
    span: int
def extract_vie_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse the expanded "vie-pron" table of regional IPA transcriptions.

    Header cells are collected per row so that each IPA span can be
    tagged with the headers whose column range overlaps its own.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                colspan = int(cell.attrs.get("colspan", "1"))
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    # A header in the first column starts a new header row.
                    if col_index == 0:
                        col_headers.clear()
                    col_headers.append(
                        TableHeader(
                            clean_node(wxr, None, cell), col_index, colspan
                        )
                    )
                    col_index += colspan
                else:
                    for span_tag in cell.find_html(
                        "span", attr_name="class", attr_value="IPA"
                    ):
                        extract_vie_pron_span_tag(
                            wxr,
                            base_data,
                            span_tag,
                            col_index,
                            colspan,
                            col_headers,
                        )
                    col_index += colspan
                    # Some cells contain raw <td> HTML with further IPA spans.
                    for td_tag in cell.find_html("td"):
                        td_colspan = int(td_tag.attrs.get("colspan", "1"))
                        for span_tag in td_tag.find_html(
                            "span", attr_name="class", attr_value="IPA"
                        ):
                            extract_vie_pron_span_tag(
                                wxr,
                                base_data,
                                span_tag,
                                col_index,
                                td_colspan,
                                col_headers,
                            )
                        col_index += td_colspan
    # Category links produced by the template.
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_vie_pron_span_tag(
    wxr: WiktextractContext,
    base_data: WordEntry,
    span_tag: HTMLNode,
    index: int,
    colspan: int,
    col_headers: list[TableHeader],
):
    """Create a Sound from one IPA <span> cell of the "vie-pron" table.

    ``index`` and ``colspan`` give the cell's horizontal position; every
    column header whose range overlaps the cell contributes its text as
    a raw tag.

    Fix: ``index`` was annotated ``str`` although both call sites pass
    an integer column index and it is used in arithmetic comparisons.
    """
    ipa = clean_node(wxr, None, span_tag)
    if ipa == "":
        return
    sound = Sound(ipa=ipa)
    for header in col_headers:
        # Header applies when [header.index, header.index + header.span)
        # overlaps [index, index + colspan).
        if index < header.index + header.span and index + colspan > header.index:
            sound.raw_tags.append(header.text)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_pron_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio file plus optional place label and IPA.

    Handles "âm thanh-IPA" / "pron-audio" / "audio-for-pron" templates;
    a missing "file" argument means there is nothing to record.
    """
    filename = clean_node(wxr, None, t_node.template_parameters.get("file", ""))
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    place = clean_node(wxr, None, t_node.template_parameters.get("place", ""))
    if place != "":
        # The recording's locality, kept as a raw tag for later translation.
        sound.raw_tags.append(place)
    sound.ipa = clean_node(wxr, None, t_node.template_parameters.get("pron", ""))
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    start_arg: int,
):
    """Extract an audio file and optional label from an audio template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh
    https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh-2

    ``start_arg`` is the positional argument holding the file name; the
    next positional argument, if present, is a descriptive label.
    """
    filename = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg, "")
    )
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    label = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg + 1, "")
    )
    if label != "":
        sound.raw_tags.append(label)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    ipa_class: str,
):
    """Extract pronunciations from an IPA-family template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:IPA

    ``ipa_class`` is the CSS class marking pronunciation spans
    ("IPA" or "enPR").
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    leftovers = []
    for child in expanded_node.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                extract_ipa_list_item(wxr, base_data, list_item, ipa_class)
        else:
            leftovers.append(child)
    if len(leftovers) > 0:
        # Wrap non-list children in a dummy root so they can be handled
        # by the same list-item extractor.
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = leftovers
        extract_ipa_list_item(wxr, base_data, wrapper, ipa_class)
    # Pick up category links from the expansion.
    clean_node(wxr, base_data, expanded_node)
def extract_ipa_list_item(
    wxr: WiktextractContext,
    base_data: WordEntry,
    list_item: WikiNode,
    class_name: str,
):
    """Collect qualifier labels and pronunciation spans from one list item.

    Italic text and qualifier/label spans become raw tags shared by all
    pronunciations found in the same item.
    """
    raw_tags = []
    for italic_node in list_item.find_child(NodeKind.ITALIC):
        label = clean_node(wxr, None, italic_node)
        if label != "":
            raw_tags.append(label)
    for span_tag in list_item.find_html_recursively("span"):
        span_classes = span_tag.attrs.get("class", "").split()
        if (
            "qualifier-content" in span_classes
            or "label-content" in span_classes
        ):
            for raw_tag in clean_node(wxr, None, span_tag).split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    raw_tags.append(raw_tag)
        elif class_name in span_classes:
            sound = Sound(
                ipa=clean_node(wxr, None, span_tag), raw_tags=raw_tags
            )
            if sound.ipa != "":
                translate_raw_tags(sound)
                base_data.sounds.append(sound)
def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract rhyme strings from an expanded "rhymes" template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:rhymes
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="IPA"
    ):
        rhyme = clean_node(wxr, None, span_tag)
        if rhyme != "":
            base_data.sounds.append(Sound(rhymes=rhyme))
    # Category links produced by the template.
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract syllable breaks from an expanded hyphenation template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:hyphenation

    The first template argument is the language code; spans with that
    ``lang`` attribute hold the hyphenated form, with syllables joined
    by the "‧" separator.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        h_str = clean_node(wxr, None, span_tag)
        parts = [p for p in (s.strip() for s in h_str.split("‧")) if p != ""]
        if len(parts) > 0:
            base_data.hyphenations.append(Hyphenation(parts=parts))
def extract_homophone_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Collect homophones from the links in a homophone section's lists.

    Fix: the list-node loop variable previously shadowed the builtin
    ``list``; renamed to ``list_node``.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for link_node in list_item.find_child(NodeKind.LINK):
                homophone = clean_node(wxr, None, link_node)
                if homophone != "":
                    base_data.sounds.append(Sound(homophone=homophone))
def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Chinese pronunciations from an expanded zh-pron template.

    ``seen_lists`` prevents double processing: nested lists handled
    recursively by the list-item extractor are skipped when the
    recursive find yields them again at the top level.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    seen_lists: set[WikiNode] = set()
    sounds: list[Sound] = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in seen_lists:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(
                extract_zh_pron_list_item(wxr, list_item, [], seen_lists)
            )
    for sound in sounds:
        translate_raw_tags(sound)
    base_data.sounds.extend(sounds)
    # Pick up category links from the expansion.
    clean_node(wxr, base_data, expanded_node)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract sounds from one zh-pron list item.

    ``raw_tags`` carries labels accumulated from ancestor list items;
    lists nested under this item are added to ``seen_lists`` so the
    caller's recursive scan does not process them a second time.
    """
    current_tags = raw_tags[:]
    sounds: list[Sound] = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if not isinstance(node, WikiNode):
            continue
        if node.kind == NodeKind.LINK:
            link_str = clean_node(wxr, None, node.largs)
            node_str = clean_node(wxr, None, node)
            if link_str.startswith(("File:", "Tập tin:")):
                filename = link_str.removeprefix("File:").removeprefix(
                    "Tập tin:"
                )
                sound_data = Sound(raw_tags=current_tags)
                set_sound_file_url_fields(wxr, filename, sound_data)
                sounds.append(sound_data)
            elif node_str != "":
                # Non-file links contribute their (unparenthesized) text
                # as a tag for following pronunciations.
                current_tags.append(node_str.strip("()"))
        elif isinstance(node, HTMLNode):
            if node.tag == "small":
                if is_first_small_tag:
                    # Drop the "ghi chú" (help) <sup> tag before reading
                    # the label text.
                    raw_tag_text = clean_node(
                        wxr,
                        None,
                        [
                            n
                            for n in node.children
                            if not (
                                isinstance(n, HTMLNode) and n.tag == "sup"
                            )
                        ],
                    ).rstrip(":")
                    current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                elif len(sounds) > 0:
                    # Later <small> tags annotate the preceding sound.
                    sounds[-1].raw_tags.extend(
                        split_zh_pron_raw_tag(clean_node(wxr, None, node))
                    )
                is_first_small_tag = False
            elif node.tag == "span":
                sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
            elif (
                node.tag == "table"
                and len(current_tags) > 0
                and current_tags[-1] == "Đồng âm"
            ):
                # A table directly after the "Đồng âm" label holds homophones.
                sounds.extend(
                    extract_zh_pron_homophones_table(wxr, node, current_tags)
                )
        elif node.kind == NodeKind.LIST:
            seen_lists.add(node)
            for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    extract_zh_pron_list_item(
                        wxr, next_list_item, current_tags, seen_lists
                    )
                )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron label string into individual raw tags.

    Text without parentheses is split on commas, colons, semicolons and
    the Vietnamese conjunction " và ", dropping a leading "bao gồm"
    ("including"). Parenthesized groups are split recursively, and any
    text outside them is processed first and prepended.
    """
    tags = []
    if "(" not in raw_tag_text:
        for piece in re.split(r",|:|;| và ", raw_tag_text):
            piece = piece.strip().removeprefix("bao gồm").strip()
            if piece != "":
                tags.append(piece)
        return tags
    group_spans = []
    for match in re.finditer(r"\([^()]+\)", raw_tag_text):
        group_spans.append((match.start(), match.end()))
        # Recurse on the text inside the parentheses.
        tags.extend(
            split_zh_pron_raw_tag(
                raw_tag_text[match.start() + 1 : match.end() - 1]
            )
        )
    # Stitch together everything outside the parenthesized groups.
    remainder = ""
    prev_end = 0
    for start, end in group_spans:
        remainder += raw_tag_text[prev_end:start]
        prev_end = end
    remainder += raw_tag_text[prev_end:]
    if remainder != raw_tag_text:
        tags = split_zh_pron_raw_tag(remainder) + tags
    else:
        # "(" present but no matchable group: keep the text as-is.
        tags.append(remainder)
    return tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract Sound entries from one zh-pron pronunciation <span>.

    <small> children give extra tags for the last sound, a "-Latn" span
    gives the romanization, and a literal "[Phonetic:" marker starts a
    separate phonetic reading that consumes the rest of the children.
    """
    sounds: list[Sound] = []
    small_tags: list[str] = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, child in enumerate(span_tag.children):
        if isinstance(child, HTMLNode) and child.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, child))
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and "-Latn" in child.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, child).strip("() ")
        elif isinstance(child, str) and child.strip() == "[Phonetic:":
            # Everything after the marker is the phonetic pronunciation.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(child)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # <small> tags in this span qualify only the final pronunciation.
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→", and "/" — except that "/" does not
    split when the whole string starts with "/" (IPA slash notation).
    Separators inside parentheses are kept as part of the segment.

    Fix: the final segment is now stripped like all earlier segments
    (previously it kept surrounding whitespace).
    """
    depth = 0  # current parenthesis nesting level
    pron_list: list[str] = []
    current = ""
    slash_splits = not zh_pron.startswith("/")
    for c in zh_pron:
        if (
            (c in (",", ";", "→") or (c == "/" and slash_splits))
            and depth == 0
            and len(current.strip()) > 0
        ):
            pron_list.append(current.strip())
            current = ""
        else:
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
            current += c
    if current.strip() != "":
        pron_list.append(current.strip())
    return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophones from the table under zh-pron's "Đồng âm" label.

    Spans classed "Hant"/"Hans" are tagged as traditional/simplified
    Chinese respectively; "Hani" spans get no script tag.
    """
    sounds: list[Sound] = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            word = clean_node(wxr, None, span_tag)
            if (
                word not in ["", "/"]
                and span_lang != ""
                and span_class in ["Hant", "Hans", "Hani"]
            ):
                sound = Sound(homophone=word, raw_tags=raw_tags)
                if span_class == "Hant":
                    sound.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    sound.tags.append("Simplified-Chinese")
                sounds.append(sound)
    return sounds
def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Thai pronunciations from an expanded th-pron table.

    https://vi.wiktionary.org/wiki/Bản mẫu:th-pron

    Row headers carry raw tags that may span multiple rows (rowspan);
    the header of each row decides which Sound field the data cells of
    that row fill ("ipa", "homophone", "audio", or "other").
    """

    @dataclass
    class TableHeader:
        raw_tags: list[str]
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Keep only headers whose rowspan still covers this row.
            surviving = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    surviving.append(header)
            row_headers = surviving
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(Tiêu chuẩn) IPA"):
                    field = "ipa"
                elif header_str.startswith("Từ đồng âm"):
                    field = "homophone"
                elif header_str == "Âm thanh":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)
            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                # "[tag, tag]" qualifies following spans.
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # NOTE(review): reassigning ``field`` here
                                # persists for later spans/cells of this
                                # row — confirm this is intentional.
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)
    base_data.sounds.extend(sounds)
    # Pick up category links from the expansion.
    clean_node(wxr, base_data, expanded_node)
def extract_homophones_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract homophones with romanizations and qualifiers.

    The first template argument is the language code; spans inside the
    "homophones" wrapper with that ``lang`` are the homophone words,
    while "tr" and "qualifier-content" spans attach to the previous one.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    homophones: list[Sound] = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for top_span in expanded_node.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for span_tag in top_span.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            span_classes = span_tag.attrs.get("class", "").split()
            if "tr" in span_classes and len(homophones) > 0:
                # Romanization of the homophone just before it.
                homophones[-1].roman = clean_node(wxr, None, span_tag)
            elif span_lang == lang_code:
                word = clean_node(wxr, None, span_tag)
                if word != "":
                    homophones.append(Sound(homophone=word))
            elif "qualifier-content" in span_classes and len(homophones) > 0:
                raw_tag = clean_node(wxr, None, span_tag)
                if raw_tag != "":
                    homophones[-1].raw_tags.append(raw_tag)
                    translate_raw_tags(homophones[-1])
    base_data.sounds.extend(homophones)
    # Category links produced by the template.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)
def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Japanese pronunciations: accent labels, IPA, romaji, audio."""
    JA_PRON_ACCENTS = {"Nakadaka", "Heiban", "Atamadaka", "Odaka"}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_classes = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_classes:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_classes:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_classes:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        # Pitch-accent type is given as a plain wiki link.
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(link_text)
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
    # NOTE(review): assumes the "a"/"audio" parameter value is a str —
    # a non-string wikitext value would fail on .strip(); confirm.
    audio_file = t_node.template_parameters.get(
        "a", t_node.template_parameters.get("audio", "")
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        base_data.sounds.append(sound)
    # Pick up category links from the expansion.
    clean_node(wxr, base_data, expanded_node)