Coverage for src / wiktextract / extractor / fr / pronunciation.py: 84%
185 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-28 08:21 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-28 08:21 +0000
1import re
3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import set_sound_file_url_fields
8from .models import Linkage, Sound, WordEntry
9from .tags import translate_raw_tags
def extract_pronunciation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    base_data: WordEntry,
) -> None:
    """Collect sounds from a pronunciation section and attach them to entries.

    Lists found directly under ``level_node`` are processed item by item with
    ``process_pron_list_item``; "cmn-pron" / "zh-cmn-pron" templates are
    handled by ``process_cmn_pron_template``.  Nothing is modified when no
    sound is found.

    The collected sounds (and the categories carried by each sound) go to:

    * every entry of ``page_data`` whose ``lang_code`` equals
      ``base_data.lang_code``, when the section is a level 3 title and
      ``page_data`` is not empty;
    * only the last entry of ``page_data`` for any other section level;
    * ``base_data`` when ``page_data`` is empty.
    """
    lang_code = base_data.lang_code
    collected = []
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                collected += process_pron_list_item(wxr, item, [], lang_code)
        elif isinstance(child, TemplateNode) and child.template_name in (
            "cmn-pron",
            "zh-cmn-pron",
        ):
            collected += process_cmn_pron_template(wxr, child)

    if not collected:
        return

    # Decide which entries receive the sounds before touching any of them.
    if not page_data:
        receivers = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        receivers = [
            entry for entry in page_data if entry.lang_code == lang_code
        ]
    else:
        receivers = [page_data[-1]]

    for entry in receivers:
        entry.sounds.extend(collected)
        for sound in collected:
            entry.categories.extend(sound.categories)
# Template names handled as pronunciation templates by
# process_sound_list_item_templates() and process_pron_template().
PRON_TEMPLATES = frozenset(
    {
        "prononciation",
        # redirects to "prononciation"
        "pron",
        "//",
        "phon",
        # uses "pron"
        "pron-recons",
        # redirects to "pron-recons"
        "prononciation reconstruite",
        "pron recons",
        "phono",
        "pron-API",
        "API",
    }
)
# Templates marking the kind of initial "h"; process_pron_template() prefixes
# their cleaned text to the IPA when one of them directly precedes the
# pronunciation template.
ASPIRATED_H_TEMPLATES = frozenset(
    {
        "h aspiré",
        # redirects to "h aspiré"
        "h",
        "h muet",
    }
)
def process_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    parent_raw_tags: list[str],
    lang_code: str,
) -> list[Sound]:
    """Extract the sounds described by one pronunciation list item.

    ``parent_raw_tags`` is copied, never modified; the copy grows while the
    children are scanned (bold text, and templates that are not recognised
    before a colon was seen) and is handed down to nested list items.

    Children are handled by type:

    * templates go through ``process_sound_list_item_templates`` together
      with the child that directly precedes them (if any);
    * bold nodes add their cleaned text to the raw tags;
    * link nodes yield one sound per ``span`` found inside them;
    * plain strings containing ":" yield a sound from the text after the
      first colon, stored in ``zh_pron`` when ``lang_code`` is "zh" and in
      ``ipa`` otherwise.
    """
    raw_tags = list(parent_raw_tags)
    sounds: list[Sound] = []
    text_field = "ipa" if lang_code != "zh" else "zh_pron"
    seen_colon = False
    children = list_item_node.children

    for position, child in enumerate(children):
        if isinstance(child, TemplateNode):
            # Empty for the first child, otherwise the single previous child.
            preceding = children[position - 1 : position]
            sounds += process_sound_list_item_templates(
                wxr, child, raw_tags, seen_colon, preceding, lang_code
            )
            continue

        if isinstance(child, WikiNode):
            if child.kind == NodeKind.BOLD:
                raw_tags.append(clean_node(wxr, None, child))
            elif child.kind == NodeKind.LINK:
                for span in child.find_html_recursively("span"):
                    linked_sound = Sound(
                        ipa=clean_node(wxr, None, span),
                        raw_tags=raw_tags[:],
                    )
                    translate_raw_tags(linked_sound)
                    sounds.append(linked_sound)
            continue

        if isinstance(child, str) and ":" in child:
            seen_colon = True
            text_after_colon = child.partition(":")[2].strip()
            if text_after_colon:
                text_sound = Sound(raw_tags=raw_tags[:])
                setattr(text_sound, text_field, text_after_colon)
                translate_raw_tags(text_sound)
                sounds.append(text_sound)

    # NOTE(review): the search below is recursive and so is this function;
    # if find_child_recursively() yields items from every nesting depth,
    # items nested two or more levels deep would be processed more than
    # once - confirm against the wikitextprocessor API.
    for nested_item in list_item_node.find_child_recursively(
        NodeKind.LIST_ITEM
    ):
        sounds += process_pron_list_item(
            wxr, nested_item, raw_tags, lang_code
        )

    return sounds
def process_sound_list_item_templates(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
    after_colon: bool,
    pre_nodes: list[WikiNode],
    lang_code: str,
) -> list[Sound]:
    """Dispatch a template found in a pronunciation list item.

    Returns the sounds produced by the handler matching the template name.
    A template without a handler produces no sound; unless it is one of
    ``ASPIRATED_H_TEMPLATES`` or a colon was already seen in the list item,
    its cleaned text is appended to ``raw_tags`` (the caller's list is
    modified in place), e.g. for location templates.
    """
    name = t_node.template_name

    if name in PRON_TEMPLATES:
        return process_pron_template(
            wxr, t_node, raw_tags, lang_code, pre_nodes
        )
    if name == "lang":
        return extract_lang_template(wxr, t_node, raw_tags, lang_code)
    if name in {"écouter", "audio", "pron-rég"}:
        return [process_ecouter_template(wxr, t_node, raw_tags)]
    if name == "pron-rimes":
        return [process_pron_rimes_template(wxr, t_node, raw_tags)]

    if name not in ASPIRATED_H_TEMPLATES and not after_colon:
        raw_tags.append(clean_node(wxr, None, t_node))
    return []
def process_pron_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
    lang_code: str,
    previous_nodes: list[WikiNode] | None = None,
) -> list[Sound]:
    """Build sounds from the IPA spans of an expanded pronunciation template.

    Args:
        wxr: extraction context used to expand and clean wikitext.
        t_node: the pronunciation template node.
        raw_tags: tags copied onto every returned sound (not modified).
        lang_code: not used by this function; kept so existing callers
            keep working.
        previous_nodes: nodes preceding ``t_node`` in its parent.  Only the
            last one is inspected: when it is one of
            ``ASPIRATED_H_TEMPLATES`` its cleaned text is prefixed to each
            IPA string.  ``None`` (the default) and an empty list both mean
            "no preceding node".

    Returns:
        One ``Sound`` per non-empty ``span`` with class "API" in the
        expanded template; an empty list when a ``PRON_TEMPLATES`` template
        has an empty first parameter.
    """
    if (
        t_node.template_name in PRON_TEMPLATES
        and clean_node(wxr, None, t_node.template_parameters.get(1, "")) == ""
    ):
        # Some pages don't pass the IPA parameter to the "pron" template,
        # which then expands to an edit link for adding the missing data.
        return []

    # https://en.wikipedia.org/wiki/Aspirated_h
    # https://fr.wiktionary.org/wiki/Modèle:h_aspiré
    aspirated_h = ""
    last_node = previous_nodes[-1] if previous_nodes else None
    if (
        isinstance(last_node, TemplateNode)
        and last_node.template_name in ASPIRATED_H_TEMPLATES
    ):
        aspirated_h = clean_node(wxr, None, last_node)

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds_list = []
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="API"
    ):
        ipa = clean_node(wxr, None, span_tag)
        if ipa != "":
            sound = Sound(raw_tags=raw_tags[:], ipa=aspirated_h + ipa)
            translate_raw_tags(sound)
            sounds_list.append(sound)
    return sounds_list
def extract_lang_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
    lang_code: str,
) -> list[Sound]:
    """Split the cleaned text of a "lang" template into pronunciations.

    The text is split on the ASCII comma and on the fullwidth comma
    (U+FF0C); the previous pattern listed the ASCII comma twice, so the
    second alternative had no effect.  Each distinct, non-empty piece
    becomes one ``Sound`` in first-seen order, stored in ``zh_pron`` when
    ``lang_code`` is "zh" and in ``ipa`` otherwise.  ``raw_tags`` is copied
    onto every sound and is not modified.
    """
    sounds = []
    field = "zh_pron" if lang_code == "zh" else "ipa"
    pron_texts = clean_node(wxr, None, t_node)
    seen = set()
    for piece in re.split("[,\uff0c]", pron_texts):
        pron_text = piece.strip()
        if not pron_text or pron_text in seen:
            continue
        seen.add(pron_text)
        sound = Sound(raw_tags=raw_tags[:])
        setattr(sound, field, pron_text)
        translate_raw_tags(sound)
        sounds.append(sound)
    return sounds
def process_ecouter_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Build a ``Sound`` from a sound file template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:écouter

    Parameters read from the template:

    * ``1``: appended to the sound's raw tags when not empty, after removing
      surrounding parentheses if the text both starts with "(" and ends
      with ")";
    * ``2``, or ``pron`` when ``2`` is absent: stored in ``ipa`` wrapped in
      square brackets;
    * ``audio``: passed to ``set_sound_file_url_fields``.

    A non-empty ``raw_tags`` is copied onto the sound before the location is
    appended.
    """
    params = template_node.template_parameters
    place = clean_node(wxr, None, params.get(1, ""))
    if place.startswith("(") and place.endswith(")"):
        place = place.strip("()")
    ipa_argument = params.get(2, params.get("pron", ""))
    ipa_text = clean_node(wxr, None, ipa_argument)
    file_name = clean_node(wxr, None, params.get("audio", ""))

    sound = Sound()
    if raw_tags:
        sound.raw_tags = raw_tags[:]
    if place:
        sound.raw_tags.append(place)
    if ipa_text:
        sound.ipa = f"[{ipa_text}]"
    if file_name:
        set_sound_file_url_fields(wxr, file_name, sound)
    translate_raw_tags(sound)
    return sound
def is_ipa_text(text: str) -> bool:
    """Tell whether ``text`` looks like IPA; used for inflection table cells.

    The text qualifies when it starts and ends with one of the delimiter
    pairs listed below.
    """
    delimiter_pairs = (
        (("\\", "*\\"), "\\"),
        ("/", "/"),
        ("[", "]"),
        # Some inflection table templates like "en-nom-rég" might put a
        # second IPA text on a new line, introduced by "ou ".
        ("ou ", "\\"),
    )
    return any(
        text.startswith(opening) and text.endswith(closing)
        for opening, closing in delimiter_pairs
    )
def process_pron_rimes_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Build a ``Sound`` from a "pron-rimes" template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:pron-rimes

    The template is fully expanded; the cleaned text of the first ``span``
    found becomes ``ipa`` and that of the second becomes ``rhymes``.  Any
    further span is cleaned but not stored.  A non-empty ``raw_tags`` is
    copied onto the sound.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    sound = Sound()
    span_fields = ("ipa", "rhymes")
    for position, span in enumerate(expanded.find_html_recursively("span")):
        span_text = clean_node(wxr, None, span)
        if position < len(span_fields):
            setattr(sound, span_fields[position], span_text)

    if raw_tags:
        sound.raw_tags = raw_tags[:]
    translate_raw_tags(sound)
    # The sound itself is passed as the second argument here - presumably so
    # that categories of the expanded template are recorded on it; verify
    # against clean_node().
    clean_node(wxr, sound, expanded)
    return sound
def process_cmn_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Extract sounds from a "cmn-pron" style template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:cmn-pron

    The template's wikitext is parsed with ``pre_expand=True`` and the
    template's own name in ``additional_expand``; every item of the lists
    found directly under the parsed root is then processed by
    ``process_pron_list_item`` with no raw tags and the language code "zh".
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(
        wikitext,
        pre_expand=True,
        additional_expand={template_node.template_name},
    )
    return [
        sound
        for list_node in expanded.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
        for sound in process_pron_list_item(wxr, item, [], "zh")
    ]
def extract_homophone_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
    title_cats: list[str],
) -> None:
    """Attach the homophones listed in a section to the matching entries.

    Homophones come from the items of the lists found directly under
    ``level_node``.  They are added, together with ``title_cats`` and the
    categories of each sound, to every entry of ``page_data`` whose
    ``lang_code`` equals ``base_data.lang_code``.  ``base_data`` receives
    them only when ``page_data`` is empty.
    """
    homophones = [
        sound
        for list_node in level_node.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
        for sound in extract_homophone_list_item(wxr, item)
    ]

    if page_data:
        receivers = [
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        ]
    else:
        receivers = [base_data]

    for entry in receivers:
        entry.sounds.extend(homophones)
        entry.categories.extend(title_cats)
        for sound in homophones:
            entry.categories.extend(sound.categories)
def extract_homophone_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Return one ``Sound`` per homophone named in a list item.

    Link nodes contribute their cleaned text; "l" / "lien" templates are
    read through ``process_lien_template`` and contribute the resulting
    word and romanization.  Empty words are skipped, as is every other kind
    of child.
    """
    homophones = []
    for child in list_item.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            linked_word = clean_node(wxr, None, child)
            if linked_word:
                homophones.append(Sound(homophone=linked_word))
            continue

        if not isinstance(child, TemplateNode):
            continue
        if child.template_name not in ("l", "lien"):
            continue

        # Imported only when needed, inside the function.
        from .linkage import process_lien_template

        link_data = Linkage(word="")
        process_lien_template(wxr, child, link_data)
        if link_data.word != "":
            homophones.append(
                Sound(homophone=link_data.word, roman=link_data.roman)
            )

    return homophones