Coverage for src/wiktextract/extractor/zh/pronunciation.py: 82%
235 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import itertools
2import re
4from wikitextprocessor import (
5 HTMLNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from ..share import set_sound_file_url_fields
14from .models import Sound, WordEntry
15from .tags import translate_raw_tags
def extract_pronunciation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: WikiNode
) -> None:
    """Collect pronunciation data from a "發音" section into *base_data*.

    Handles the section's direct child templates first, then every list
    item found anywhere under the section.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name == "zh-forms":
            # Imported here to avoid a circular import with .page.
            from .page import process_zh_forms

            process_zh_forms(wxr, base_data, template)
        else:
            sounds, cats = process_pron_template(wxr, template)
            base_data.sounds.extend(sounds)
            base_data.categories.extend(cats)
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        sounds, cats = process_pron_item_list_item(wxr, item)
        base_data.sounds.extend(sounds)
        base_data.categories.extend(cats)
def process_pron_item_list_item(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Process the templates of one pronunciation list item.

    Returns ``(sounds, categories)``.  A shared raw-tag list is passed to
    every template so that accent templates earlier in the item can label
    the sounds produced by later templates.
    """
    shared_raw_tags: list[str] = []
    all_sounds: list[Sound] = []
    all_categories: list[str] = []
    for t_node in list_item_node.find_child(NodeKind.TEMPLATE):
        sounds, cats = process_pron_template(wxr, t_node, shared_raw_tags)
        all_sounds.extend(sounds)
        all_categories.extend(cats)
    return all_sounds, all_categories
def process_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str] | None = None,
) -> tuple[list[Sound], list[str]]:
    """Dispatch one pronunciation template to its handler.

    Returns ``(sounds, categories)``.  *raw_tags* is an accumulator shared
    across the templates of a single list item: "a"/"accent" templates
    append to it, the audio/IPA/enPR handlers read from it.

    Fix: the parameter previously used a mutable default (``= []``), so
    accent tags appended during one bare call leaked into every later
    bare call.  ``None`` sentinel restores per-call isolation.
    """
    if raw_tags is None:
        raw_tags = []
    template_name = template_node.template_name.lower()
    sounds = []
    categories = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    elif template_name in ["homophones", "homophone", "hmp"]:
        sounds.extend(process_homophones_template(wxr, template_node))
    elif template_name in ["a", "accent"]:
        # https://zh.wiktionary.org/wiki/Template:Accent
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ["audio", "音"]:
        sounds.extend(process_audio_template(wxr, template_node, raw_tags))
    elif template_name == "ipa":
        sounds.extend(process_ipa_template(wxr, template_node, raw_tags))
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    elif template_name == "ja-pron":
        new_sounds, new_cats = extract_ja_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    return sounds, categories
def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand {{zh-pron}} and parse the rendered pronunciation lists.

    https://zh.wiktionary.org/wiki/Template:Zh-pron
    Returns ``(sounds, categories)``.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    visited_lists: set[WikiNode] = set()
    sounds: list[Sound] = []
    cats: dict = {}
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        # Skip lists already consumed recursively by a parent list item.
        if list_node in visited_lists:
            continue
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(
                process_zh_pron_list_item(wxr, item, [], visited_lists)
            )
    clean_node(wxr, cats, expanded)
    for sound in sounds:
        translate_raw_tags(sound)
    return sounds, cats.get("categories", [])
def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively parse one list item of the expanded {{zh-pron}} output.

    *raw_tags* holds tags inherited from the parent list item; this level
    works on its own copy.  Nested lists handled here are added to
    *seen_lists* so the caller does not process them a second time.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith("File:"):
                    # Audio file link: becomes its own Sound with the
                    # tags accumulated so far.
                    filename = link_str.removeprefix("File:")
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # Plain wiki link text is treated as a label tag.
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # First <small> carries the label text; drop the
                    # "幫助" (help) <sup> link before reading it.
                    if is_first_small_tag:
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Later <small> tags annotate the last sound seen.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "同音詞"
                ):
                    # "同音詞" (homophones) label precedes a table of words.
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )

            elif node.kind == NodeKind.LIST:
                # Recurse into the nested list, passing down current tags;
                # mark it so the top-level scan skips it.
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        process_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron label string into individual raw tags.

    Text without parentheses is split on comma/colon/enumeration
    separators; otherwise parenthesized groups are split out and the
    function recurses on the pieces.
    """
    raw_tags = []
    if "(" not in raw_tag_text and "(" not in raw_tag_text:
        # No parentheses: split on separator characters and strip a
        # leading "包括" ("including") marker from each piece.
        for raw_tag in re.split(r",|,|:|、|和", raw_tag_text):
            raw_tag = raw_tag.strip().removeprefix("包括").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        # Spans consumed by the regex (parenthesized groups or plain runs).
        processed_offsets = []
        for match in re.finditer(r"\([^()]+\)|([^()]+)", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            # Recurse with the first and last character of the match
            # dropped — this strips the parentheses from "(...)" groups.
            # NOTE(review): plain (non-parenthesized) runs matched by the
            # second alternative also lose their edge characters here —
            # confirm against real zh-pron label text whether that is
            # intended or those runs are expected to reach not_processed.
            raw_tags.extend(
                split_zh_pron_raw_tag(
                    raw_tag_text[match.start() + 1 : match.end() - 1]
                )
            )
        # Re-assemble the text the regex did not match at all.
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            # Leftover text is split too and placed before the
            # parenthesized tags.
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            # Regex matched nothing: keep the whole text as one tag.
            raw_tags.append(not_processed)
    return raw_tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Parse one pronunciation <span> from expanded {{zh-pron}} output.

    Separates the romanization sub-span, trailing <small> tag text, and
    an optional "實際讀音" (actual pronunciation) tail from the
    pronunciation text itself, and builds Sound objects for each piece.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            # <small> inside the span carries extra raw tags for the
            # last pronunciation produced below.
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            # Romanization sub-span, e.g. lang="zh-Latn".
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[實際讀音:":
            # Everything after this marker is the "actual pronunciation".
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # A span with class "IPA" holds IPA text; otherwise it is a
            # romanized/other pronunciation string.
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # <small> tags apply to the last pronunciation only.
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["實際讀音"],
            )
        )
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are the comma/semicolon/arrow characters below, plus "/"
    unless the whole string starts with "/" (an IPA /.../ notation).
    Text inside parentheses (either variant listed below) is never split.

    Fix: the trailing segment is now stripped like every mid-string
    segment; previously it was appended with surrounding whitespace.
    """
    parentheses = 0  # current parenthesis nesting depth
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
            and parentheses == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        elif c in ["(", "("]:
            parentheses += 1
            pron += c
        elif c in [")", ")"]:
            parentheses -= 1
            pron += c
        else:
            # Note: a separator seen while the buffer is still blank falls
            # through to here and is kept as ordinary text (original
            # behavior, preserved).
            pron += c
    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Read homophone words from a zh-pron homophones HTML table.

    Only spans with a ``lang`` attribute and a Han script class are
    taken; "Hant"/"Hans" classes map to script tags on the Sound.
    """
    results: list[Sound] = []
    for cell in table.find_html_recursively("td"):
        for span in cell.find_html("span"):
            span_class = span.attrs.get("class", "")
            span_lang = span.attrs.get("lang", "")
            word = clean_node(wxr, None, span)
            # Skip placeholders and non-Han-script spans.
            if (
                word in ["", "/"]
                or span_lang == ""
                or span_class not in ["Hant", "Hans", "Hani"]
            ):
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            if span_class == "Hant":
                sound.tags.append("Traditional-Chinese")
            elif span_class == "Hans":
                sound.tags.append("Simplified-Chinese")
            results.append(sound)
    return results
def process_homophones_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Extract homophone words from {{homophones}} numbered parameters.

    https://zh.wiktionary.org/wiki/Template:homophones
    Words start at parameter 2; iteration stops at the first gap.
    """
    sounds: list[Sound] = []
    index = 2
    while index in template_node.template_parameters:
        word = clean_node(
            wxr, None, template_node.template_parameters.get(index, "")
        )
        if len(word) > 0:
            sounds.append(Sound(homophone=word))
        index += 1
    return sounds
def process_audio_template(
    wxr: WiktextractContext, template_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Build one Sound from an {{Audio}} template.

    https://zh.wiktionary.org/wiki/Template:Audio
    Parameter 2 is the sound file name, parameter 3 an optional label.
    """
    sound = Sound()
    filename = clean_node(
        wxr, None, template_node.template_parameters.get(2, "")
    )
    set_sound_file_url_fields(wxr, filename, sound)
    label = clean_node(
        wxr, None, template_node.template_parameters.get(3, "")
    )
    if len(label) > 0:
        sound.raw_tags.append(label)
    sound.raw_tags.extend(raw_tags)
    return [sound]
def process_ipa_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Create one Sound per numbered {{IPA}} parameter.

    https://zh.wiktionary.org/wiki/Template:IPA
    Transcriptions start at parameter 2; stops at the first gap.
    """
    sounds: list[Sound] = []
    index = 2
    while index in template_node.template_parameters:
        ipa_text = clean_node(
            wxr, None, template_node.template_parameters.get(index)
        )
        sounds.append(Sound(ipa=ipa_text, raw_tags=raw_tags))
        index += 1
    return sounds
def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Create Sounds from {{enPR}} parameters 1 through 3.

    https://zh.wiktionary.org/wiki/Template:enPR
    Iteration stops at the first missing parameter.
    """
    sounds: list[Sound] = []
    for index in (1, 2, 3):
        if index not in template_node.template_parameters:
            break
        value = clean_node(
            wxr, None, template_node.template_parameters.get(index)
        )
        sounds.append(Sound(enpr=value, raw_tags=raw_tags))
    return sounds
def extract_ja_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand {{ja-pron}} and collect accents, IPA, romaji and kana.

    Returns ``(sounds, categories)``; list items lacking both IPA and
    kana text are discarded.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds: list[Sound] = []
    for li_tag in expanded.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            if "usage-label-accent" in span_class:
                # Accent label, e.g. region name wrapped in parentheses.
                label = clean_node(wxr, None, span_tag).strip("() ")
                if label != "":
                    sound.raw_tags.append(label)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            sounds.append(sound)
    cats: dict = {}
    clean_node(wxr, cats, expanded)
    return sounds, cats.get("categories", [])