Coverage for src/wiktextract/extractor/zh/pronunciation.py: 59% of 317 statements
import itertools
import re
from dataclasses import dataclass

from wikitextprocessor import (
    HTMLNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry
from .tags import translate_raw_tags

def extract_pronunciation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: WikiNode
) -> None:
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "zh-forms":
            from .page import process_zh_forms

            process_zh_forms(wxr, base_data, t_node)
        else:
            new_sounds, new_cats = process_pron_template(wxr, t_node)
            base_data.sounds.extend(new_sounds)
            base_data.categories.extend(new_cats)
    for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        new_sounds, new_cats = process_pron_item_list_item(wxr, list_item_node)
        base_data.sounds.extend(new_sounds)
        base_data.categories.extend(new_cats)
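
# A hedged sketch of the wikitext this handler consumes (a hypothetical
# entry; real sections vary). Direct template children are processed first,
# then templates nested inside list items:
#
#   ===發音===
#   * {{a|美國}} {{IPA|en|/ˈæp.əl/}}
#   * {{audio|en|en-us-apple.ogg|音頻 (美國)}}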

def process_pron_item_list_item(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    raw_tags = []
    sounds = []
    categories = []
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        new_sounds, new_cats = process_pron_template(
            wxr, template_node, raw_tags
        )
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    return sounds, categories

def process_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str] | None = None,
) -> tuple[list[Sound], list[str]]:
    # A mutable default list would be shared across calls and is mutated
    # below in the "{{a}}"/"{{accent}}" branch, so use None as the sentinel.
    if raw_tags is None:
        raw_tags = []
    template_name = template_node.template_name.lower()
    sounds = []
    categories = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    elif template_name in ["homophones", "homophone", "hmp"]:
        sounds.extend(process_homophones_template(wxr, template_node))
    elif template_name in ["a", "accent"]:
        # https://zh.wiktionary.org/wiki/Template:Accent
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ["audio", "音"]:
        sounds.extend(process_audio_template(wxr, template_node, raw_tags))
    elif template_name == "ipa":
        sounds.extend(process_ipa_template(wxr, template_node, raw_tags))
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    elif template_name == "ja-pron":
        new_sounds, new_cats = extract_ja_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    elif template_name == "th-pron":
        new_sounds, new_cats = extract_th_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    return sounds, categories

def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    # https://zh.wiktionary.org/wiki/Template:Zh-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    seen_lists = set()
    sounds = []
    categories = {}
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node not in seen_lists:
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    process_zh_pron_list_item(wxr, list_item, [], seen_lists)
                )
    clean_node(wxr, categories, expanded_node)
    for sound in sounds:
        translate_raw_tags(sound)
    return sounds, categories.get("categories", [])
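
# Hypothetical sketch of the expanded zh-pron output the traversal below
# walks: a nested bullet list of dialect links, <small> tag labels, and
# <span> pronunciation nodes (real output carries more markup):
#
#   * [[官話]]
#   ** <small>(拼音)</small>: <span lang="cmn-Latn">píngguǒ</span>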

def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith("File:"):
                    filename = link_str.removeprefix("File:")
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove the "幫助" (help) <sup> tag
                    if is_first_small_tag:
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "同音詞"
                ):
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )

            elif node.kind == NodeKind.LIST:
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        process_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    raw_tags = []
    if "(" not in raw_tag_text and "（" not in raw_tag_text:
        for raw_tag in re.split(r"，|,|：|、|；|;|和(?!$)", raw_tag_text):
            raw_tag = raw_tag.strip().removeprefix("包括").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        processed_offsets = []
        for match in re.finditer(r"\([^()]+\)|（[^（）]+）", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            raw_tags.extend(
                split_zh_pron_raw_tag(
                    raw_tag_text[match.start() + 1 : match.end() - 1]
                )
            )
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            raw_tags.append(not_processed)
    return raw_tags
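
# A hedged usage sketch of the splitter above (not part of the module);
# expected results are hand-traced from the logic, not from a test run:
#
#   >>> split_zh_pron_raw_tag("泉州、漳州")
#   ['泉州', '漳州']
#   >>> split_zh_pron_raw_tag("漳州（包括浮宮）")  # parenthesized part recurses
#   ['漳州', '浮宮']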

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[實際讀音:":
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["實際讀音"],
            )
        )
    return sounds

def split_zh_pron(zh_pron: str) -> list[str]:
    # Split on commas and other separators that appear outside parentheses.
    parentheses = 0
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
            and parentheses == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        elif c in ["(", "（"]:
            parentheses += 1
            pron += c
        elif c in [")", "）"]:
            parentheses -= 1
            pron += c
        else:
            pron += c

    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
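
# A hedged usage sketch, hand-traced from the rules above: separators only
# split outside parentheses, and "/" only splits when the string does not
# itself start with "/":
#
#   >>> split_zh_pron("pin1 (pin2), pin3")
#   ['pin1 (pin2)', 'pin3']
#   >>> split_zh_pron("ka/ga")
#   ['ka', 'ga']
#   >>> split_zh_pron("/ka/")
#   ['/ka/']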

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            span_str = clean_node(wxr, None, span_tag)
            if (
                span_str not in ["", "/"]
                and span_lang != ""
                and span_class in ["Hant", "Hans", "Hani"]
            ):
                sound = Sound(homophone=span_str, raw_tags=raw_tags)
                if span_class == "Hant":
                    sound.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    sound.tags.append("Simplified-Chinese")
                sounds.append(sound)
    return sounds

def process_homophones_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    # https://zh.wiktionary.org/wiki/Template:homophones
    sounds = []
    for word_index in itertools.count(2):
        if word_index not in template_node.template_parameters:
            break
        homophone = clean_node(
            wxr, None, template_node.template_parameters.get(word_index, "")
        )
        if len(homophone) > 0:
            sounds.append(Sound(homophone=homophone))
    return sounds
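
# Hedged example: parameter 1 is presumably the language code and the words
# start at parameter 2, so {{homophones|ja|感じ|漢字}} would yield
# [Sound(homophone="感じ"), Sound(homophone="漢字")].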

def process_audio_template(
    wxr: WiktextractContext, template_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    # https://zh.wiktionary.org/wiki/Template:Audio
    sound_file = clean_node(
        wxr, None, template_node.template_parameters.get(2, "")
    )
    sound_data = Sound()
    set_sound_file_url_fields(wxr, sound_file, sound_data)
    raw_tag = clean_node(
        wxr, None, template_node.template_parameters.get(3, "")
    )
    if len(raw_tag) > 0:
        sound_data.raw_tags.append(raw_tag)
    sound_data.raw_tags.extend(raw_tags)
    return [sound_data]
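
# Hedged example: with {{audio|en|en-us-apple.ogg|美國}}, parameter 2 names
# the sound file and parameter 3 an optional label, giving one Sound with its
# file URL fields filled and raw_tags ["美國"] plus any inherited raw_tags.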

def process_ipa_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    # https://zh.wiktionary.org/wiki/Template:IPA
    sounds = []
    for index in itertools.count(2):
        if index not in template_node.template_parameters:
            break
        sound = Sound(
            ipa=clean_node(
                wxr, None, template_node.template_parameters.get(index)
            ),
            raw_tags=raw_tags,
        )
        sounds.append(sound)
    return sounds
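
# Hedged example: {{IPA|en|/ˈæp.əl/|/ˈap.əl/}} would yield two Sound objects,
# one per positional parameter from index 2 upward (parameter 1 is the
# language code and is skipped).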

def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    # https://zh.wiktionary.org/wiki/Template:enPR
    sounds = []
    for index in range(1, 4):
        if index not in template_node.template_parameters:
            break
        sound = Sound(
            enpr=clean_node(
                wxr, None, template_node.template_parameters.get(index)
            ),
            raw_tags=raw_tags,
        )
        sounds.append(sound)
    return sounds
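
# Hedged example: enPR notations sit in positional parameters 1 through 3,
# so {{enPR|ăpʹəl}} would yield [Sound(enpr="ăpʹəl")].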

def extract_ja_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            if "usage-label-accent" in span_class:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            sounds.append(sound)

    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])

def extract_th_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    @dataclass
    class TableHeader:
        raw_tags: list[str]
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(標準泰語) IPA"):
                    field = "ipa"
                elif header_str.startswith("同音詞"):
                    field = "homophone"
                elif header_str == "音頻":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)

    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])
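
# Hedged summary of the {{th-pron}} table layout assumed above, derived from
# the walker's own branches rather than from the template source: a header
# cell starting with "(標準泰語) IPA" switches the row to the "ipa" field,
# "同音詞" to homophone spans with lang="th", and "音頻" to audio file links;
# any other non-empty header contributes raw tags to the sounds in the rows
# it covers via its rowspan.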