Coverage for src / wiktextract / extractor / zh / pronunciation.py: 70%
510 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-19 11:25 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-19 11:25 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from ..share import set_sound_file_url_fields
14from .models import Hyphenation, Sound, WordEntry
15from .tags import translate_raw_tags
def extract_pronunciation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: WikiNode
) -> None:
    """Walk a pronunciation section and fill ``base_data`` sounds/categories."""
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "zh-forms":
            # Imported lazily to avoid a circular import with page.py.
            from .page import process_zh_forms

            process_zh_forms(wxr, base_data, t_node)
        else:
            sounds, cats = process_pron_template(wxr, base_data, t_node)
            base_data.sounds.extend(sounds)
            base_data.categories.extend(cats)
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        sounds, cats = process_pron_item_list_item(wxr, base_data, list_item)
        base_data.sounds.extend(sounds)
        base_data.categories.extend(cats)
def process_pron_item_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Process one pronunciation list item; return ``(sounds, categories)``.

    Accent/qualifier templates on the same line accumulate into a shared
    tag list that later sound templates on the line pick up.
    """
    shared_raw_tags: list[str] = []
    all_sounds: list[Sound] = []
    all_cats: list[str] = []
    for t_node in list_item_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.lower() in ("hyph", "hyphenation"):
            extract_hyphenation_template(wxr, base_data, t_node)
        else:
            sounds, cats = process_pron_template(
                wxr, base_data, t_node, shared_raw_tags
            )
            all_sounds.extend(sounds)
            all_cats.extend(cats)
    return all_sounds, all_cats
def process_pron_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
    raw_tags: list[str] | None = None,
) -> tuple[list[Sound], list[str]]:
    """Dispatch one pronunciation-section template to its handler.

    Returns ``(sounds, categories)``.  ``raw_tags`` collects qualifier
    labels (e.g. from {{a}}/{{accent}}) that apply to later templates on
    the same line; a caller-supplied list is mutated in place.

    Fix: the default used to be a mutable ``[]``, which Python evaluates
    once and shares across all calls, so accent labels appended by the
    {{a}} branch could leak into unrelated later entries.  Use a ``None``
    sentinel and create a fresh list per call instead.
    """
    if raw_tags is None:
        raw_tags = []
    template_name = template_node.template_name.lower()
    sounds = []
    categories = []
    new_sounds = []
    new_cats = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
    elif template_name in ["rhymes", "rhyme"]:
        new_sounds, new_cats = extract_rhymes_template(wxr, template_node)
    elif template_name in ["homophones", "homophone", "hmp"]:
        new_sounds, new_cats = extract_homophones_template(wxr, template_node)
    elif template_name in ["a", "accent"]:
        # https://zh.wiktionary.org/wiki/Template:Accent
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ["audio", "音"]:
        new_sounds, new_cats = process_audio_template(
            wxr, template_node, raw_tags
        )
    elif template_name == "ko-ipa":
        new_sounds, new_cats = extract_ko_ipa_template(
            wxr, template_node, raw_tags
        )
    elif template_name == "ipa" or template_name.endswith("-ipa"):
        new_sounds, new_cats = extract_ipa_template(
            wxr, template_node, raw_tags
        )
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    elif template_name == "ja-pron":
        new_sounds, new_cats = extract_ja_pron_template(wxr, template_node)
    elif template_name == "th-pron":
        new_sounds, new_cats = extract_th_pron_template(wxr, template_node)
    elif template_name.endswith("-pr"):
        # Polish pronunciation templates like {{pl-pr}}.
        new_sounds, new_cats = extract_pl_pr_template(
            wxr, base_data, template_node
        )
    sounds.extend(new_sounds)
    categories.extend(new_cats)
    return sounds, categories
def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand {{zh-pron}} and harvest sounds from its nested lists.

    https://zh.wiktionary.org/wiki/Template:Zh-pron
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    visited_lists: set[WikiNode] = set()
    sounds: list[Sound] = []
    cat_data: dict = {}
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in visited_lists:
            # Already consumed while recursing into a parent list item.
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(
                process_zh_pron_list_item(wxr, list_item, [], visited_lists)
            )
    clean_node(wxr, cat_data, expanded_node)
    for sound in sounds:
        translate_raw_tags(sound)
    return sounds, cat_data.get("categories", [])
def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively process one list item of the expanded {{zh-pron}} tree.

    ``raw_tags`` holds dialect/romanization labels inherited from parent
    list items; this level copies the list before extending it so sibling
    items are unaffected.  Nested lists are added to ``seen_lists`` so the
    top-level caller does not process them a second time.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith("File:"):
                    # Audio file link for this pronunciation line.
                    filename = link_str.removeprefix("File:")
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # A plain link's text is treated as another label.
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # The first <small> is the line's label; remove the
                    # "幫助" (help) <sup> tag before reading it.
                    if is_first_small_tag:
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Later <small> text qualifies the sound just added.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "同音詞"
                ):
                    # A table right after a "同音詞" (homophones) label
                    # holds homophone words rather than pronunciations.
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                # Recurse into the child list; mark it visited so the
                # top-level walk skips it.
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        process_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a {{zh-pron}} label string into individual raw tags.

    Text without an opening parenthesis (either width) is split on a set
    of ASCII/CJK separators; parenthesized text is peeled apart and the
    pieces split recursively.
    """
    if "(" not in raw_tag_text and "(" not in raw_tag_text:
        # Flat case: split on separators, strip a leading "包括"
        # ("including") marker and drop empty pieces.
        pieces = re.split(r",|,|:|、|;|;|和(?!$)", raw_tag_text)
        return [
            cleaned
            for cleaned in (
                piece.strip().removeprefix("包括").strip() for piece in pieces
            )
            if cleaned != ""
        ]
    tags: list[str] = []
    covered: list[tuple[int, int]] = []
    for m in re.finditer(r"\([^()]+\)|([^()]+)", raw_tag_text):
        covered.append((m.start(), m.end()))
        # Recurse on the match with its first and last characters dropped.
        tags.extend(
            split_zh_pron_raw_tag(raw_tag_text[m.start() + 1 : m.end() - 1])
        )
    # Stitch together whatever text the pattern skipped over.
    leftovers = ""
    prev_end = 0
    for start, end in covered:
        leftovers += raw_tag_text[prev_end:start]
        prev_end = end
    leftovers += raw_tag_text[prev_end:]
    if leftovers == raw_tag_text:
        # Nothing was consumed; stop here to avoid infinite recursion.
        tags.append(leftovers)
    else:
        tags = split_zh_pron_raw_tag(leftovers) + tags
    return tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract pronunciation values from one <span> of expanded {{zh-pron}}.

    The span may contain a romanization sub-span (lang="*-Latn"), a
    trailing <small> qualifier (applied to the last sound), and an
    "[實際讀音: …]" (actual/phonetic reading) suffix.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            # Qualifier text; attached to the last sound created below.
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            # Romanization sub-span, e.g. lang="zh-Latn".
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[實際讀音:":
            # Everything after this marker up to the closing bracket is
            # the actual (phonetic) reading.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # The span's class decides whether this is IPA or a
            # romanized/phonetic notation.
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["實際讀音"],
            )
        )
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→" and "/" — the latter only when the whole
    string does not start with "/", so IPA notation like "/a/" survives.
    Both ASCII and fullwidth parentheses guard against splitting inside a
    parenthesized group.

    Fix: the trailing fragment was previously appended unstripped while
    every fragment split off earlier was stripped; strip it too so all
    returned pieces are whitespace-trimmed consistently.
    """
    parentheses = 0  # current nesting depth
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
            and parentheses == 0
            and len(pron.strip()) > 0
        ):
            # Top-level separator: emit the piece; the separator is dropped.
            pron_list.append(pron.strip())
            pron = ""
        elif c in ["(", "("]:
            parentheses += 1
            pron += c
        elif c in [")", ")"]:
            parentheses -= 1
            pron += c
        else:
            pron += c
    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophone words from a {{zh-pron}} homophones HTML table."""
    results: list[Sound] = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            word = clean_node(wxr, None, span_tag)
            lang_attr = span_tag.attrs.get("lang", "")
            class_attr = span_tag.attrs.get("class", "")
            # Keep only non-empty Han-script spans with a language set;
            # "/" is just a visual separator between variants.
            if (
                word in ["", "/"]
                or lang_attr == ""
                or class_attr not in ["Hant", "Hans", "Hani"]
            ):
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            if class_attr == "Hant":
                sound.tags.append("Traditional-Chinese")
            elif class_attr == "Hans":
                sound.tags.append("Simplified-Chinese")
            results.append(sound)
    return results
def extract_homophones_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract homophones from {{homophones}}/{{hmp}}.

    https://zh.wiktionary.org/wiki/Template:homophones
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    results: list[Sound] = []
    cat_data: dict = {}
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for top_span in expanded_node.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for span_tag in top_span.find_html("span"):
            lang_attr = span_tag.attrs.get("lang", "")
            classes = span_tag.attrs.get("class", "").split()
            if "Latn" in classes and len(results) > 0:
                # A romanization span follows the word it transliterates.
                results[-1].roman = clean_node(wxr, None, span_tag)
            elif lang_attr == lang_code:
                word = clean_node(wxr, None, span_tag)
                if word != "":
                    results.append(Sound(homophone=word))
            elif "qualifier-content" in classes and len(results) > 0:
                qualifier = clean_node(wxr, None, span_tag)
                if qualifier != "":
                    results[-1].raw_tags.append(qualifier)
                    translate_raw_tags(results[-1])
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, cat_data, link_node)
    return results, cat_data.get("categories", [])
def process_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> tuple[list[Sound], list[str]]:
    """Extract one audio file plus its qualifiers from {{audio}}/{{音}}.

    https://zh.wiktionary.org/wiki/Template:Audio
    """
    cat_data: dict = {}
    sound_data = Sound(raw_tags=raw_tags)
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    set_sound_file_url_fields(wxr, filename, sound_data)
    caption = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
    if len(caption) > 0:
        sound_data.raw_tags.append(caption)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_node in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        # Inline qualifier text, possibly comma-separated.
        for piece in re.split(r",|,", clean_node(wxr, None, span_node)):
            piece = piece.strip()
            if piece != "":
                sound_data.raw_tags.append(piece)
    translate_raw_tags(sound_data)
    clean_node(wxr, cat_data, expanded_node)
    return [sound_data], cat_data.get("categories", [])
def extract_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> tuple[list[Sound], list[str]]:
    """Extract IPA pronunciations from {{IPA}} or a "*-IPA" template.

    https://zh.wiktionary.org/wiki/Template:IPA
    """
    cat_data: dict = {}
    sounds: list[Sound] = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, cat_data, expanded_node)
    leftover_nodes = []
    for child in expanded_node.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                sounds.extend(extract_ipa_list_item(wxr, list_item, raw_tags))
        else:
            leftover_nodes.append(child)
    if len(leftover_nodes) > 0:
        # Treat any top-level (non-list) content as one pseudo list item.
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = leftover_nodes
        sounds.extend(extract_ipa_list_item(wxr, wrapper, raw_tags))
    return sounds, cat_data.get("categories", [])
def extract_ipa_list_item(
    wxr: WiktextractContext, list_item: WikiNode, shared_raw_tags: list[str]
) -> list[Sound]:
    """Extract Sound objects from one expanded {{IPA}} list item.

    Qualifier spans seen before the colon apply to every sound on the
    line; qualifiers after the colon apply only to the next IPA or
    romanization span and are cleared once that span is consumed.
    """
    sounds = []
    # Copy so tags found on this line never mutate the caller's list.
    shared_raw_tags = shared_raw_tags[:]
    raw_tags = []
    after_colon = False
    for node in list_item.children:
        if isinstance(node, str) and (":" in node or ":" in node):
            # ASCII or fullwidth colon separates labels from values.
            after_colon = True
        elif isinstance(node, HTMLNode) and node.tag == "span":
            span_class = node.attrs.get("class", "").split()
            if (
                "qualifier-content" in span_class
                or "ib-content" in span_class
                or "usage-label-accent" in span_class
            ):
                for raw_tag in (
                    clean_node(wxr, None, node).strip("() ").split(",")
                ):
                    raw_tag = raw_tag.strip()
                    if raw_tag != "":
                        if after_colon:
                            raw_tags.append(raw_tag)
                        else:
                            shared_raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound = Sound(
                    ipa=clean_node(wxr, None, node),
                    raw_tags=shared_raw_tags + raw_tags,
                )
                if sound.ipa != "":
                    translate_raw_tags(sound)
                    sounds.append(sound)
                # Per-sound qualifiers are consumed even if the span was
                # empty.
                raw_tags.clear()
            elif "Latn" in span_class:
                # Romanization-only span (no IPA class).
                sound = Sound(
                    roman=clean_node(wxr, None, node),
                    raw_tags=shared_raw_tags + raw_tags,
                )
                if sound.roman != "":
                    translate_raw_tags(sound)
                    sounds.append(sound)
                raw_tags.clear()
    return sounds
def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Extract enPR pronunciations from {{enPR}}.

    https://zh.wiktionary.org/wiki/Template:enPR
    """
    sounds: list[Sound] = []
    # The template takes up to three positional pronunciation arguments;
    # stop at the first missing one.
    index = 1
    while index <= 3 and index in template_node.template_parameters:
        sound = Sound(
            enpr=clean_node(
                wxr, None, template_node.template_parameters.get(index)
            ),
            raw_tags=raw_tags,
        )
        translate_raw_tags(sound)
        sounds.append(sound)
        index += 1
    return sounds
def extract_ja_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract pronunciation data from the expanded {{ja-pron}} template.

    Returns ``(sounds, categories)``.  Each <li> contributes one Sound
    with accent qualifiers, IPA, romanization and the kana reading; pitch
    accent type links are mapped to English tags.
    """
    # Pitch-accent link text -> tag name.
    JA_PRON_ACCENTS = {
        "中高型": "Nakadaka",
        "平板型": "Heiban",
        "頭高型": "Atamadaka",
        "尾高型": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_class:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(JA_PRON_ACCENTS[link_text])
        # Only keep list items that actually carried a pronunciation.
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            sounds.append(sound)

    # Fix: the "a"/"audio" argument can be a WikiNode rather than a str,
    # so calling .strip() on it directly could raise AttributeError.
    # Expand it through clean_node first, as extract_ko_ipa_template
    # already does for the same parameters.
    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)
    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])
def extract_th_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract pronunciation data from the expanded {{th-pron}} table.

    Header cells of each row either select the field of the following
    data cells ("ipa", "homophone", "audio") or carry labels that become
    raw tags on the sounds of the rows they span (via ``rowspan``).
    """
    @dataclass
    class TableHeader:
        # Labels taken from the header cell text.
        raw_tags: list[str]
        # Number of rows this header still applies to.
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Age out headers whose rowspan is exhausted.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(標準泰語) IPA"):
                    # "(Standard Thai) IPA" header.
                    field = "ipa"
                elif header_str.startswith("同音詞"):
                    # "Homophones" header.
                    field = "homophone"
                elif header_str == "音頻":
                    # "Audio" header.
                    field = "audio"
                elif header_str != "":
                    # A label header; remember it for rowspan rows.
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            # "[a, b]" style qualifier lists become raw
                            # tags; other <small> text is a romanization
                            # of the previous sound.
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                if "romanization" in sound.tags:
                                    # NOTE(review): rebinding ``field``
                                    # here also affects later spans of
                                    # this cell — confirm that is the
                                    # intended behavior.
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)

    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])
def extract_rhymes_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand {{rhymes}}/{{rhyme}} and read the rhyme links it produces."""
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    return extract_rhymes_list_item(wxr, expanded_node)
def extract_rhymes_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Collect rhyme values (and categories) from one list item's links."""
    cat_data: dict = {}
    rhyme_texts = [
        clean_node(wxr, cat_data, link_node)
        for link_node in list_item.find_child(NodeKind.LINK)
    ]
    sounds = [Sound(rhymes=text) for text in rhyme_texts if text != ""]
    return sounds, cat_data.get("categories", [])
def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> None:
    """Expand {{hyph}}/{{hyphenation}} and record hyphenation parts."""
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    extract_hyphenation_list_item(wxr, base_data, expanded_node, lang_code)
def extract_hyphenation_list_item(
    wxr: WiktextractContext,
    base_data: WordEntry,
    list_item: WikiNode,
    lang_code: str,
) -> None:
    """Read hyphenations from spans whose lang matches the entry language."""
    for span_tag in list_item.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        span_str = clean_node(wxr, None, span_tag)
        # Syllables are separated by "‧"; drop empty fragments.
        parts = [part.strip() for part in span_str.split("‧")]
        parts = [part for part in parts if part != ""]
        if len(parts) > 0:
            base_data.hyphenations.append(Hyphenation(parts=parts))
def extract_pl_pr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract pronunciations from Polish "*-pr" templates (e.g. {{pl-pr}}).

    Each expanded list item contains either an audio table, an IPA span,
    or a labelled line such as "韻部" (rhymes) / "音節化" (syllabification).
    """
    sounds = []
    cats = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        skip_list = False
        for html_node in list_item.find_child(NodeKind.HTML):
            if html_node.tag == "table":
                sounds.extend(extract_pl_pr_sound_table(wxr, html_node))
                skip_list = True
                break
            elif (
                html_node.tag == "span"
                and "IPA" in html_node.attrs.get("class", "").split()
            ):
                sounds.extend(extract_ipa_list_item(wxr, list_item, []))
                skip_list = True
                break
        if skip_list:
            continue
        for index, node in enumerate(list_item.children):
            if isinstance(node, str) and (":" in node or ":" in node):
                # Text before the first (ASCII or fullwidth) colon names
                # the kind of data on this line.
                m = re.search(r":|:", node)
                list_type = clean_node(
                    wxr, None, list_item.children[:index] + [node[: m.start()]]
                )
                if list_type == "韻部":
                    # Rhymes line.
                    new_sounds, _ = extract_rhymes_list_item(wxr, list_item)
                    sounds.extend(new_sounds)
                    break
                elif list_type == "音節化":
                    # Syllabification line.
                    extract_hyphenation_list_item(
                        wxr, base_data, list_item, "pl"
                    )
                    break
    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])
def extract_pl_pr_sound_table(
    wxr: WiktextractContext, table_node: HTMLNode
) -> list[Sound]:
    """Read audio rows from the table produced by Polish "*-pr" templates."""
    sounds: list[Sound] = []
    for tr_node in table_node.find_html("tr"):
        row_label = ""
        for td_node in tr_node.find_html("td"):
            td_classes = td_node.attrs.get("class", "").split()
            if len(td_classes) == 0:
                # An unstyled cell holds the italicized row label.
                for i_node in td_node.find_html("i"):
                    row_label = clean_node(wxr, None, i_node)
            elif "audiofile" in td_classes:
                for link_node in td_node.find_child(NodeKind.LINK):
                    if (
                        len(link_node.largs) == 0
                        or len(link_node.largs[0]) == 0
                    ):
                        continue
                    file_name = clean_node(
                        wxr, None, link_node.largs[0][0]
                    ).removeprefix("File:")
                    if file_name == "":
                        continue
                    sound = Sound()
                    set_sound_file_url_fields(wxr, file_name, sound)
                    if row_label != "":
                        sound.raw_tags.append(row_label)
                        translate_raw_tags(sound)
                    sounds.append(sound)
    return sounds
def extract_ko_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> tuple[list[Sound], list[str]]:
    """Extract pronunciation data from the expanded {{ko-IPA}} template.

    Reads the phonetic hangeul line, the IPA lines (with their link
    labels as raw tags), the romanization table, and an optional
    "a"/"audio" file parameter.
    """
    cats = {}
    sounds = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, cats, expanded_node)
    for ul_node in expanded_node.find_html("ul"):
        for li_node in ul_node.find_html("li"):
            if "ko-pron__ph" in li_node.attrs.get("class", ""):
                # Phonetic hangeul line; variants are "/"-separated.
                for span_node in li_node.find_html(
                    "span", attr_name="lang", attr_value="ko"
                ):
                    hangeul_str = clean_node(wxr, None, span_node).strip("[]")
                    for hangeul in hangeul_str.split("/"):
                        if hangeul != "":
                            sounds.append(
                                Sound(hangeul=hangeul, tags=["phonetic"])
                            )
            else:
                # NOTE(review): this rebinds the ``raw_tags`` parameter,
                # so caller-supplied tags are dropped for the IPA sounds
                # below — confirm that is intended.
                raw_tags = []
                for link_node in li_node.find_child(NodeKind.LINK):
                    raw_tag = clean_node(wxr, None, link_node)
                    if raw_tag not in ["", "IPA"]:
                        raw_tags.append(raw_tag)
                for span_node in li_node.find_html(
                    "span", attr_name="class", attr_value="IPA"
                ):
                    # "~" separates free-variation pronunciations.
                    ipas = clean_node(wxr, None, span_node)
                    for ipa in ipas.split("~"):
                        ipa = ipa.strip()
                        if ipa != "":
                            sound = Sound(ipa=ipa, raw_tags=raw_tags)
                            translate_raw_tags(sound)
                            sounds.append(sound)

    # Romanization table: the row header names the scheme, data cells
    # hold the romanized forms.
    for table in expanded_node.find_html("table"):
        for tr in table.find_html("tr"):
            raw_tag = ""
            for th in tr.find_html("th"):
                raw_tag = clean_node(wxr, None, th)
            for td in tr.find_html("td"):
                roman = clean_node(wxr, None, td)
                if roman != "":
                    sound = Sound(roman=roman)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                        translate_raw_tags(sound)
                    sounds.append(sound)

    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)

    return sounds, cats.get("categories", [])