Coverage for src/wiktextract/extractor/zh/pronunciation.py: 84%
133 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import itertools
2import re
4from wikitextprocessor import (
5 HTMLNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from ..share import create_audio_url_dict, set_sound_file_url_fields
14from .models import Sound, WordEntry
15from .tags import translate_raw_tags
def extract_pronunciation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Collect the sounds and categories of a pronunciation section.

    Sounds already stored on ``base_data`` are discarded first. Templates
    that are direct children of ``level_node`` are processed, followed by
    every list item found anywhere below it.

    Args:
        wxr: Extraction context handed on to the template processors.
        page_data: Not used by this function; kept so the signature stays
            unchanged for callers.
        base_data: Entry whose ``sounds`` and ``categories`` lists are
            updated in place.
        level_node: Section node to scan.

    Returns:
        None. All results are written to ``base_data``.
    """
    base_data.sounds.clear()

    # Accent templates append to the tag list they are given, and later
    # templates read from it. Passing an explicit, section-local list means
    # tags collected here cannot carry over into another section or page.
    section_raw_tags: list[str] = []
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        new_sounds, new_cats = process_pron_template(
            wxr, template_node, section_raw_tags
        )
        base_data.sounds.extend(new_sounds)
        base_data.categories.extend(new_cats)

    for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        new_sounds, new_cats = process_pron_item_list_item(wxr, list_item_node)
        base_data.sounds.extend(new_sounds)
        base_data.categories.extend(new_cats)
def process_pron_item_list_item(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Process every template that is a direct child of one list item.

    A single tag list is shared by all templates of the item, so tags added
    by an earlier template are visible to the ones that follow it.

    Args:
        wxr: Extraction context handed on to ``process_pron_template``.
        list_item_node: List item whose template children are processed.

    Returns:
        A ``(sounds, categories)`` pair gathered from all templates.
    """
    item_sounds = []
    item_categories = []
    shared_raw_tags = []
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        template_sounds, template_cats = process_pron_template(
            wxr, template_node, shared_raw_tags
        )
        item_sounds += template_sounds
        item_categories += template_cats
    return item_sounds, item_categories
def process_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: "list[str] | None" = None,
) -> tuple[list[Sound], list[str]]:
    """Dispatch one pronunciation template to its handler.

    The template name is compared case-insensitively. Templates with a name
    not listed below produce no sounds and no categories.

    Args:
        wxr: Extraction context handed on to the handlers.
        template_node: Template to process.
        raw_tags: Tag list shared with the caller. Accent templates append
            to it in place; audio, IPA and enPR handlers receive it. When
            omitted, a fresh empty list is used for this call only.

    Returns:
        A ``(sounds, categories)`` pair. Only ``zh-pron`` yields categories.
    """
    # A list literal as the default would be created once and shared by
    # every call, so tags appended by the accent branch would leak between
    # unrelated calls.
    if raw_tags is None:
        raw_tags = []

    template_name = template_node.template_name.lower()
    sounds = []
    categories = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    elif template_name in ["homophones", "homophone", "hmp"]:
        sounds.extend(process_homophones_template(wxr, template_node))
    elif template_name in ["a", "accent"]:
        # https://zh.wiktionary.org/wiki/Template:Accent
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ["audio", "音"]:
        sounds.extend(process_audio_template(wxr, template_node, raw_tags))
    elif template_name == "ipa":
        sounds.extend(process_ipa_template(wxr, template_node, raw_tags))
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    return sounds, categories
def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand a zh-pron template and extract sounds from its lists.

    The template is converted back to wikitext, parsed with full expansion,
    and every list in the result is walked. Lists that were already handled
    as a nested list of an earlier item are skipped.

    Args:
        wxr: Extraction context providing the parser and ``clean_node``.
        template_node: The zh-pron template node.

    Returns:
        A ``(sounds, categories)`` pair; categories are whatever
        ``clean_node`` stored under the ``"categories"`` key, or an empty
        list when that key is absent.
    """
    # https://zh.wiktionary.org/wiki/Template:Zh-pron
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)

    visited_lists = set()
    sounds = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in visited_lists:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            item_sounds = process_zh_pron_list_item(
                wxr, list_item, [], visited_lists
            )
            sounds.extend(item_sounds)

    category_data = {}
    clean_node(wxr, category_data, expanded_node)
    for sound in sounds:
        translate_raw_tags(sound)
    return sounds, category_data.get("categories", [])
def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Turn one list item of an expanded zh-pron template into sounds.

    Children are visited in order. Tags accumulate in a private copy of
    ``raw_tags`` as links and ``<small>`` elements are met; the tags gathered
    so far are attached to each sound created afterwards and handed down to
    nested lists.

    Args:
        wxr: Extraction context used for cleaning nodes and for warnings.
        list_item_node: List item to process.
        raw_tags: Tags inherited from the enclosing item; not modified.
        seen_lists: Set that every nested list met here is added to.

    Returns:
        The sounds found in this item and in its nested lists.
    """
    inherited_tags = list(raw_tags)
    sounds = []
    for child in list_item_node.children:
        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK:
            if len(child.largs) > 0 and child.largs[0][0].startswith("File:"):
                file_name = child.largs[0][0].removeprefix("File:")
                sound_file_data = create_audio_url_dict(file_name)
                sound_data = Sound()
                # The loop variable must stay named "key": the f-string
                # below prints the variable name as part of the message.
                for key, value in sound_file_data.items():
                    if key in Sound.model_fields:
                        setattr(sound_data, key, value)
                    else:
                        wxr.wtp.warning(
                            f"{key=} not defined in Sound",
                            sortid="zh.pronunciation/56",
                        )
                sounds.append(sound_data)
            else:
                # Any other link is treated as a tag.
                link_text = clean_node(wxr, None, child)
                inherited_tags.append(link_text.strip("()"))
        elif isinstance(child, HTMLNode):
            tag_name = child.tag
            if tag_name == "small":
                # Only the non-HTML children are cleaned, which removes
                # the "幫助" (help) <sup> tag.
                small_text = clean_node(
                    wxr,
                    None,
                    list(child.invert_find_child(NodeKind.HTML)),
                ).strip("()")
                for piece in re.split(r",|:", small_text):
                    piece = piece.strip()
                    if len(piece) > 0:
                        inherited_tags.append(piece)
            elif tag_name == "span":
                zh_pron = clean_node(wxr, None, child)
                if len(zh_pron) > 0:
                    is_ipa = "IPA" in child.attrs.get("class", "")
                    sound = (
                        Sound(ipa=zh_pron, raw_tags=inherited_tags)
                        if is_ipa
                        else Sound(zh_pron=zh_pron, raw_tags=inherited_tags)
                    )
                    sounds.append(sound)
            elif (
                tag_name == "table"
                and len(inherited_tags) > 0
                and inherited_tags[-1] == "同音詞"
            ):
                sounds.extend(
                    process_homophones_table(wxr, child, inherited_tags)
                )
        elif child.kind == NodeKind.LIST:
            # Record the nested list so the caller does not walk it again.
            seen_lists.add(child)
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                nested_sounds = process_zh_pron_list_item(
                    wxr, nested_item, inherited_tags, seen_lists
                )
                sounds.extend(nested_sounds)
    return sounds
def process_homophones_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Build one homophone sound per consecutive positional argument.

    Arguments are read starting at position 2 and reading stops at the
    first missing position. Arguments that clean to an empty string are
    skipped without ending the scan.

    Args:
        wxr: Extraction context used by ``clean_node``.
        template_node: The homophones template node.

    Returns:
        Sounds with only ``homophone`` set.
    """
    # https://zh.wiktionary.org/wiki/Template:homophones
    params = template_node.template_parameters
    homophone_sounds = []
    word_index = 2
    while word_index in params:
        homophone = clean_node(wxr, None, params.get(word_index, ""))
        if homophone:
            homophone_sounds.append(Sound(homophone=homophone))
        word_index += 1
    return homophone_sounds
def process_homophones_table(
    wxr: WiktextractContext,
    table_node: HTMLNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Create a homophone sound for each ``<span lang=...>`` in a table.

    Args:
        wxr: Extraction context used by ``clean_node``.
        table_node: HTML table node to search recursively.
        raw_tags: Tags given to every sound created here.

    Returns:
        One sound per matching span, in the order the spans are found.
    """
    lang_spans = table_node.find_html_recursively("span", attr_name="lang")
    return [
        Sound(homophone=clean_node(wxr, None, span_node), raw_tags=raw_tags)
        for span_node in lang_spans
    ]
def process_audio_template(
    wxr: WiktextractContext, template_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Build a sound from an audio template.

    Positional argument 2 is taken as the sound file and argument 3, when
    it cleans to a non-empty string, as a tag placed ahead of ``raw_tags``.

    Args:
        wxr: Extraction context used by the helpers.
        template_node: The audio template node.
        raw_tags: Tags appended after the template's own tag.

    Returns:
        A list holding the single sound that was built.
    """
    # https://zh.wiktionary.org/wiki/Template:Audio
    params = template_node.template_parameters
    file_name = clean_node(wxr, None, params.get(2, ""))
    audio_sound = Sound()
    set_sound_file_url_fields(wxr, file_name, audio_sound)

    template_tag = clean_node(wxr, None, params.get(3, ""))
    if template_tag:
        audio_sound.raw_tags.append(template_tag)
    audio_sound.raw_tags.extend(raw_tags)
    return [audio_sound]
def process_ipa_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Build one IPA sound per consecutive positional argument.

    Arguments are read starting at position 2 and reading stops at the
    first missing position.

    Args:
        wxr: Extraction context used by ``clean_node``.
        template_node: The IPA template node.
        raw_tags: Tags given to every sound created here.

    Returns:
        The sounds in argument order.
    """
    # https://zh.wiktionary.org/wiki/Template:IPA
    params = template_node.template_parameters
    present_indexes = itertools.takewhile(
        lambda position: position in params, itertools.count(2)
    )
    return [
        Sound(ipa=clean_node(wxr, None, params.get(index)), raw_tags=raw_tags)
        for index in present_indexes
    ]
def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Build one enPR sound per consecutive positional argument.

    Only positions 1 to 3 are considered, and reading stops at the first
    missing position.

    Args:
        wxr: Extraction context used by ``clean_node``.
        template_node: The enPR template node.
        raw_tags: Tags given to every sound created here.

    Returns:
        The sounds in argument order.
    """
    # https://zh.wiktionary.org/wiki/Template:enPR
    params = template_node.template_parameters
    enpr_sounds = []
    position = 1
    while position < 4 and position in params:
        enpr_text = clean_node(wxr, None, params.get(position))
        enpr_sounds.append(Sound(enpr=enpr_text, raw_tags=raw_tags))
        position += 1
    return enpr_sounds