Coverage for src/wiktextract/extractor/zh/pronunciation.py: 66%
454 statements
« prev ^ index » next — coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from ..share import set_sound_file_url_fields
14from .models import Hyphenation, Sound, WordEntry
15from .tags import translate_raw_tags
def extract_pronunciation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: WikiNode
) -> None:
    """Collect sounds and categories from a pronunciation section.

    Templates directly under the section are dispatched first, then every
    list item (at any depth) is processed.  Results are appended onto
    ``base_data`` in place.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name == "zh-forms":
            # imported lazily to avoid a circular import with .page
            from .page import process_zh_forms

            process_zh_forms(wxr, base_data, template)
        else:
            sounds, cats = process_pron_template(wxr, base_data, template)
            base_data.sounds.extend(sounds)
            base_data.categories.extend(cats)
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        sounds, cats = process_pron_item_list_item(wxr, base_data, item)
        base_data.sounds.extend(sounds)
        base_data.categories.extend(cats)
def process_pron_item_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Process one pronunciation list item; return (sounds, categories).

    Hyphenation templates are stored on ``base_data`` directly; everything
    else is dispatched through ``process_pron_template`` with a raw-tag
    list shared across the item's templates (accent templates fill it).
    """
    collected_sounds: list[Sound] = []
    collected_cats: list[str] = []
    shared_raw_tags: list[str] = []
    for template in list_item_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.lower() in ("hyph", "hyphenation"):
            extract_hyphenation_template(wxr, base_data, template)
        else:
            sounds, cats = process_pron_template(
                wxr, base_data, template, shared_raw_tags
            )
            collected_sounds.extend(sounds)
            collected_cats.extend(cats)
    return collected_sounds, collected_cats
def process_pron_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
    raw_tags: list[str] | None = None,
) -> tuple[list[Sound], list[str]]:
    """Dispatch one pronunciation template to its extractor.

    ``raw_tags`` is deliberately shared state within one list item: the
    "a"/"accent" branch appends to it so that later templates in the same
    item pick the qualifier up.  Returns (sounds, categories).
    """
    # Bug fix: the original signature used the mutable default
    # ``raw_tags: list[str] = []``; since the accent branch appends to it,
    # accent tags leaked across unrelated calls that omitted the argument.
    if raw_tags is None:
        raw_tags = []
    template_name = template_node.template_name.lower()
    sounds: list[Sound] = []
    categories: list[str] = []
    new_sounds: list[Sound] = []
    new_cats: list[str] = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
    elif template_name in ["rhymes", "rhyme"]:
        new_sounds, new_cats = extract_rhymes_template(wxr, template_node)
    elif template_name in ["homophones", "homophone", "hmp"]:
        new_sounds, new_cats = extract_homophones_template(wxr, template_node)
    elif template_name in ["a", "accent"]:
        # https://zh.wiktionary.org/wiki/Template:Accent
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ["audio", "音"]:
        sounds.extend(process_audio_template(wxr, template_node, raw_tags))
    elif template_name == "ipa" or template_name.endswith("-ipa"):
        new_sounds, new_cats = extract_ipa_template(
            wxr, template_node, raw_tags
        )
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    elif template_name == "ja-pron":
        new_sounds, new_cats = extract_ja_pron_template(wxr, template_node)
    elif template_name == "th-pron":
        new_sounds, new_cats = extract_th_pron_template(wxr, template_node)
    elif template_name.endswith("-pr"):
        new_sounds, new_cats = extract_pl_pr_template(
            wxr, base_data, template_node
        )
    sounds.extend(new_sounds)
    categories.extend(new_cats)
    return sounds, categories
def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand {{zh-pron}} and parse its nested lists into Sound objects.

    https://zh.wiktionary.org/wiki/Template:Zh-pron
    Returns (sounds, categories).
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    seen_lists: set[WikiNode] = set()
    sounds: list[Sound] = []
    categories: dict = {}
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        # the item walker recurses into child lists itself and records
        # them in seen_lists, so skip anything already handled
        if list_node in seen_lists:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(
                process_zh_pron_list_item(wxr, list_item, [], seen_lists)
            )
    clean_node(wxr, categories, expanded_node)
    for sound in sounds:
        translate_raw_tags(sound)
    return sounds, categories.get("categories", [])
def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively walk one {{zh-pron}} list item.

    ``raw_tags`` holds labels inherited from parent list levels; a copy is
    extended with labels found at this level and passed down to child
    lists.  ``seen_lists`` records lists handled here so the caller does
    not walk them again.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith("File:"):
                    # audio file link -> Sound with the tags seen so far
                    filename = link_str.removeprefix("File:")
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # plain wiki-link text acts as a label for this level
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # first <small> labels the upcoming pronunciations;
                    # later ones qualify the sound just emitted
                    if is_first_small_tag:
                        # drop the "幫助" (help) <sup> tag before cleaning
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "同音詞"
                ):
                    # homophones table, announced by a "同音詞" label
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                # child list: mark as handled, recurse with our tags
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        process_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a {{zh-pron}} label string into individual raw tags.

    Text without parentheses is split on comma/colon/semicolon variants
    and on the conjunction "和" (unless it is the final character).
    Parenthesized text is handled recursively: each matched span is split
    on its own, and whatever the regex skipped is processed as a whole.
    """
    raw_tags = []
    if "(" not in raw_tag_text and "(" not in raw_tag_text:
        for raw_tag in re.split(r",|,|:|、|;|;|和(?!$)", raw_tag_text):
            # "包括" ("including") is a lead-in word, not a tag
            raw_tag = raw_tag.strip().removeprefix("包括").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        processed_offsets = []
        for match in re.finditer(r"\([^()]+\)|([^()]+)", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            # recurse on the matched span minus its outermost characters
            raw_tags.extend(
                split_zh_pron_raw_tag(
                    raw_tag_text[match.start() + 1 : match.end() - 1]
                )
            )
        # reassemble the characters the regex did not match
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            # leftover text is split first so its tags come before the
            # parenthesized qualifiers
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            # nothing matched at all: keep the text as a single tag
            raw_tags.append(not_processed)
    return raw_tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Turn one pronunciation <span> from {{zh-pron}} into Sound objects.

    A nested <small> qualifies the last pronunciation, a nested span with
    a "*-Latn" lang attribute supplies the romanization, and text after a
    "[實際讀音:" ("actual pronunciation") marker becomes a phonetic
    variant entry.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[實際讀音:":
            # everything after this marker is the phonetic reading
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # an IPA class on the wrapper span marks IPA notation
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # the <small> qualifier applies to the last pronunciation only
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["實際讀音"],
            )
        )
    return sounds
265def split_zh_pron(zh_pron: str) -> list[str]:
266 # split by comma and other symbols that outside parentheses
267 parentheses = 0
268 pron_list = []
269 pron = ""
270 for c in zh_pron:
271 if (
272 (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
273 and parentheses == 0
274 and len(pron.strip()) > 0
275 ):
276 pron_list.append(pron.strip())
277 pron = ""
278 elif c in ["(", "("]:
279 parentheses += 1
280 pron += c
281 elif c in [")", ")"]:
282 parentheses -= 1
283 pron += c
284 else:
285 pron += c
287 if pron.strip() != "":
288 pron_list.append(pron)
289 return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophone words from a table inside expanded {{zh-pron}}."""
    script_tags = {"Hant": "Traditional-Chinese", "Hans": "Simplified-Chinese"}
    results: list[Sound] = []
    for cell in table.find_html_recursively("td"):
        for span in cell.find_html("span"):
            word = clean_node(wxr, None, span)
            lang_attr = span.attrs.get("lang", "")
            class_attr = span.attrs.get("class", "")
            # keep only non-empty Han-script words with a lang attribute
            if (
                word in ("", "/")
                or lang_attr == ""
                or class_attr not in ("Hant", "Hans", "Hani")
            ):
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            if class_attr in script_tags:
                sound.tags.append(script_tags[class_attr])
            results.append(sound)
    return results
def extract_homophones_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Parse an expanded {{homophones}} template; return (sounds, cats).

    https://zh.wiktionary.org/wiki/Template:homophones
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    results: list[Sound] = []
    categories: dict = {}
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for wrapper in expanded.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for span in wrapper.find_html("span"):
            lang_attr = span.attrs.get("lang", "")
            class_list = span.attrs.get("class", "").split()
            if "Latn" in class_list and len(results) > 0:
                # romanization follows the word it transliterates
                results[-1].roman = clean_node(wxr, None, span)
            elif lang_attr == lang_code:
                word = clean_node(wxr, None, span)
                if word != "":
                    results.append(Sound(homophone=word))
            elif "qualifier-content" in class_list and len(results) > 0:
                qualifier = clean_node(wxr, None, span)
                if qualifier != "":
                    results[-1].raw_tags.append(qualifier)
                    translate_raw_tags(results[-1])
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, categories, link_node)
    return results, categories.get("categories", [])
def process_audio_template(
    wxr: WiktextractContext, template_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Build one Sound from an {{audio}} template.

    https://zh.wiktionary.org/wiki/Template:Audio
    Argument 2 is the file name, argument 3 an optional caption tag.
    """
    sound = Sound()
    filename = clean_node(
        wxr, None, template_node.template_parameters.get(2, "")
    )
    set_sound_file_url_fields(wxr, filename, sound)
    caption = clean_node(
        wxr, None, template_node.template_parameters.get(3, "")
    )
    if caption != "":
        sound.raw_tags.append(caption)
    sound.raw_tags.extend(raw_tags)
    return [sound]
def extract_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> tuple[list[Sound], list[str]]:
    """Expand {{IPA}} and parse each list item; return (sounds, cats).

    https://zh.wiktionary.org/wiki/Template:IPA
    """
    categories: dict = {}
    results: list[Sound] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, categories, expanded)
    leftovers = []
    for child in expanded.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                results.extend(extract_ipa_list_item(wxr, list_item, raw_tags))
        else:
            leftovers.append(child)
    if len(leftovers) > 0:
        # treat stray top-level nodes as a single pseudo list item
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = leftovers
        results.extend(extract_ipa_list_item(wxr, wrapper, raw_tags))
    return results, categories.get("categories", [])
def extract_ipa_list_item(
    wxr: WiktextractContext, list_item: WikiNode, shared_raw_tags: list[str]
) -> list[Sound]:
    """Parse one expanded {{IPA}} list item into Sound objects.

    ``shared_raw_tags`` is copied, so the caller's list is never mutated;
    it applies to every sound in the item.  Qualifier spans seen after
    the first colon only apply to the next pronunciation span.
    """
    sounds = []
    shared_raw_tags = shared_raw_tags[:]
    raw_tags = []
    after_colon = False
    for node in list_item.children:
        if isinstance(node, str) and (":" in node or ":" in node):
            after_colon = True
        elif isinstance(node, HTMLNode) and node.tag == "span":
            span_class = node.attrs.get("class", "").split()
            if (
                "qualifier-content" in span_class
                or "ib-content" in span_class
                or "usage-label-accent" in span_class
            ):
                # qualifier labels: item-wide before the colon,
                # per-pronunciation after it
                for raw_tag in (
                    clean_node(wxr, None, node).strip("() ").split(",")
                ):
                    raw_tag = raw_tag.strip()
                    if raw_tag != "":
                        if after_colon:
                            raw_tags.append(raw_tag)
                        else:
                            shared_raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound = Sound(
                    ipa=clean_node(wxr, None, node),
                    raw_tags=shared_raw_tags + raw_tags,
                )
                if sound.ipa != "":
                    translate_raw_tags(sound)
                    sounds.append(sound)
                # per-pronunciation qualifiers are consumed either way
                raw_tags.clear()
            elif "Latn" in span_class:
                sound = Sound(
                    roman=clean_node(wxr, None, node),
                    raw_tags=shared_raw_tags + raw_tags,
                )
                if sound.roman != "":
                    translate_raw_tags(sound)
                    sounds.append(sound)
                raw_tags.clear()
    return sounds
def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Extract up to three enPR pronunciations (positional args 1-3).

    https://zh.wiktionary.org/wiki/Template:enPR
    Stops at the first missing positional argument.
    """
    results: list[Sound] = []
    for arg_index in range(1, 4):
        if arg_index not in template_node.template_parameters:
            break
        enpr_value = clean_node(
            wxr, None, template_node.template_parameters.get(arg_index)
        )
        results.append(Sound(enpr=enpr_value, raw_tags=raw_tags))
    return results
def extract_ja_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Parse an expanded {{ja-pron}} template; return (sounds, categories).

    Each <li> yields at most one Sound (kept only if it has IPA or kana
    text); pitch-accent link labels map onto English tags, and the "a" /
    "audio" template parameter adds a sound-file entry.
    """
    JA_PRON_ACCENTS = {
        "中高型": "Nakadaka",
        "平板型": "Heiban",
        "頭高型": "Atamadaka",
        "尾高型": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_class:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(JA_PRON_ACCENTS[link_text])
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            sounds.append(sound)

    # Bug fix: template parameter values are not guaranteed to be plain
    # strings (they can be wikitext nodes), so calling ``.strip()`` on the
    # raw value could raise AttributeError.  Run the value through
    # clean_node, as every other parameter access in this module does.
    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)
    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])
def extract_th_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Parse the expanded {{th-pron}} table; return (sounds, categories).

    The row header decides what the row's data cells contain (IPA,
    homophones, audio, or another transcription kept in ``other``);
    rowspan bookkeeping keeps a header's raw tags attached to every row
    it spans.
    """

    @dataclass
    class TableHeader:
        raw_tags: list[str]  # tag strings contributed by this header cell
        rowspan: int  # number of rows this header still covers

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # keep only headers whose rowspan still reaches this row
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(標準泰語) IPA"):
                    field = "ipa"
                elif header_str.startswith("同音詞"):
                    field = "homophone"
                elif header_str == "音頻":
                    field = "audio"
                elif header_str != "":
                    # ordinary row header: its lines hold raw tags
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                # bracketed <small> is a tag list, "[a, b]"
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                # unbracketed <small> is the romanization
                                # of the previous sound
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                if "romanization" in sound.tags:
                                    # NOTE(review): this retargets ``field``
                                    # for the remainder of the row, not just
                                    # this span — confirm that is intended
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)

    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])
def extract_rhymes_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand a {{rhymes}} template and delegate to the list-item parser."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    return extract_rhymes_list_item(wxr, expanded)
def extract_rhymes_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Collect rhyme links from a list item; return (sounds, categories)."""
    cats: dict = {}
    results: list[Sound] = []
    for link_node in list_item.find_child(NodeKind.LINK):
        rhyme_text = clean_node(wxr, cats, link_node)
        if rhyme_text != "":
            results.append(Sound(rhymes=rhyme_text))
    return results, cats.get("categories", [])
def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand a {{hyph}} template and store its data on ``base_data``."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # first positional argument is the language code of the word
    code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    extract_hyphenation_list_item(wxr, base_data, expanded, code)
def extract_hyphenation_list_item(
    wxr: WiktextractContext,
    base_data: WordEntry,
    list_item: WikiNode,
    lang_code: str,
):
    """Read hyphenation parts from <span lang=...> onto ``base_data``."""
    for span_tag in list_item.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        text = clean_node(wxr, None, span_tag)
        # syllables are separated by the "‧" character
        parts = [p.strip() for p in text.split("‧") if p.strip() != ""]
        if len(parts) > 0:
            base_data.hyphenations.append(Hyphenation(parts=parts))
def extract_pl_pr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Parse an expanded "*-pr" pronunciation template (e.g. {{pl-pr}}).

    Each list item is either a sound table, an IPA line, a rhymes line
    ("韻部"), or a syllabification line ("音節化").  Hyphenations go
    straight onto ``base_data``; sounds and categories are returned.
    """
    sounds = []
    cats = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        skip_list = False
        for html_node in list_item.find_child(NodeKind.HTML):
            if html_node.tag == "table":
                sounds.extend(extract_pl_pr_sound_table(wxr, html_node))
                skip_list = True
                break
            elif (
                html_node.tag == "span"
                and "IPA" in html_node.attrs.get("class", "").split()
            ):
                sounds.extend(extract_ipa_list_item(wxr, list_item, []))
                skip_list = True
                break
        if skip_list:
            continue
        # no table or IPA span: look for a "label: value" line and
        # dispatch on the label before the (ASCII or fullwidth) colon
        for index, node in enumerate(list_item.children):
            if isinstance(node, str) and (":" in node or ":" in node):
                m = re.search(r":|:", node)
                list_type = clean_node(
                    wxr, None, list_item.children[:index] + [node[: m.start()]]
                )
                if list_type == "韻部":  # rhymes
                    new_sounds, _ = extract_rhymes_list_item(wxr, list_item)
                    sounds.extend(new_sounds)
                    break
                elif list_type == "音節化":  # syllabification
                    extract_hyphenation_list_item(
                        wxr, base_data, list_item, "pl"
                    )
                    break
    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])
def extract_pl_pr_sound_table(
    wxr: WiktextractContext, table_node: HTMLNode
) -> list[Sound]:
    """Read audio rows from a "*-pr" sound table.

    An unclassed cell's <i> text labels the row; "audiofile" cells hold
    the file links that become Sound entries tagged with that label.
    """
    results: list[Sound] = []
    for row in table_node.find_html("tr"):
        row_label = ""
        for cell in row.find_html("td"):
            cell_classes = cell.attrs.get("class", "").split()
            if not cell_classes:
                for italic in cell.find_html("i"):
                    row_label = clean_node(wxr, None, italic)
            elif "audiofile" in cell_classes:
                for link in cell.find_child(NodeKind.LINK):
                    if len(link.largs) == 0 or len(link.largs[0]) == 0:
                        continue
                    name = clean_node(
                        wxr, None, link.largs[0][0]
                    ).removeprefix("File:")
                    if name == "":
                        continue
                    sound = Sound()
                    set_sound_file_url_fields(wxr, name, sound)
                    if row_label != "":
                        sound.raw_tags.append(row_label)
                        translate_raw_tags(sound)
                    results.append(sound)
    return results