Coverage for src/wiktextract/extractor/fr/pronunciation.py: 84%
185 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import set_sound_file_url_fields
8from .models import Linkage, Sound, WordEntry
9from .tags import translate_raw_tags
def extract_pronunciation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    base_data: WordEntry,
) -> None:
    """Collect the sounds of a pronunciation section and attach them.

    List children of ``level_node`` are handled item by item through
    ``process_pron_list_item``; "cmn-pron" and "zh-cmn-pron" templates go
    through ``process_cmn_pron_template``. When no sound is found nothing
    is modified.

    The sounds (and the categories carried by each sound) are added to:

    * every entry of ``page_data`` sharing ``base_data.lang_code`` when the
      section is a level 3 title and ``page_data`` is not empty;
    * otherwise the last entry of ``page_data`` when it is not empty;
    * otherwise ``base_data``.
    """
    lang_code = base_data.lang_code
    sounds: list[Sound] = []
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    process_pron_list_item(wxr, item, [], lang_code)
                )
        elif isinstance(child, TemplateNode) and child.template_name in (
            "cmn-pron",
            "zh-cmn-pron",
        ):
            sounds.extend(process_cmn_pron_template(wxr, child))

    if not sounds:
        return

    # Work out which entries receive the sounds, then update them uniformly.
    if level_node.kind == NodeKind.LEVEL3 and page_data:
        targets = [
            entry for entry in page_data if entry.lang_code == lang_code
        ]
    elif page_data:
        targets = [page_data[-1]]
    else:
        targets = [base_data]

    for entry in targets:
        entry.sounds.extend(sounds)
        for sound in sounds:
            entry.categories.extend(sound.categories)
# Template names that are handled as IPA pronunciation templates.
PRON_TEMPLATES = frozenset(
    {
        "prononciation",
        # redirects to "prononciation"
        "pron",
        "//",
        "phon",
        "pron-recons",  # uses "pron"
        # redirects to "pron-recons"
        "prononciation reconstruite",
        "pron recons",
        "phono",
    }
)
# Templates describing the initial "h"; they are not turned into raw tags and
# their cleaned text is prefixed to the IPA of the template that follows them.
ASPIRATED_H_TEMPLATES = frozenset(
    {
        "h aspiré",
        "h",  # redirects to "h aspiré"
        "h muet",
    }
)
def process_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    parent_raw_tags: list[str],
    lang_code: str,
) -> list[Sound]:
    """Extract sounds from one pronunciation list item and its nested items.

    ``parent_raw_tags`` is copied, never modified. Bold children add their
    cleaned text to the working raw tags; templates are dispatched to
    ``process_sound_list_item_templates`` (which may also add raw tags);
    ``span`` tags found inside links become IPA sounds; text following the
    first ":" of a string child becomes a sound stored in ``zh_pron`` when
    ``lang_code`` is "zh" and in ``ipa`` otherwise.
    """
    raw_tags = list(parent_raw_tags)
    found: list[Sound] = []
    text_field = "ipa" if lang_code != "zh" else "zh_pron"
    seen_colon = False
    children = list_item_node.children

    for position, child in enumerate(children):
        if isinstance(child, TemplateNode):
            # Hand over only the node right before the template; the slice
            # is empty for the first child.
            preceding = children[position - 1 : position]
            found.extend(
                process_sound_list_item_templates(
                    wxr, child, raw_tags, seen_colon, preceding, lang_code
                )
            )
            continue

        if isinstance(child, WikiNode):
            if child.kind == NodeKind.BOLD:
                raw_tags.append(clean_node(wxr, None, child))
            elif child.kind == NodeKind.LINK:
                for span in child.find_html_recursively("span"):
                    linked_sound = Sound(
                        ipa=clean_node(wxr, None, span),
                        raw_tags=raw_tags[:],
                    )
                    translate_raw_tags(linked_sound)
                    found.append(linked_sound)
            continue

        if isinstance(child, str) and ":" in child:
            seen_colon = True
            # Keep whatever follows the first colon.
            text_after_colon = child.partition(":")[2].strip()
            if text_after_colon:
                text_sound = Sound(raw_tags=raw_tags[:])
                setattr(text_sound, text_field, text_after_colon)
                translate_raw_tags(text_sound)
                found.append(text_sound)

    # NOTE(review): the search below is recursive and each hit recurses
    # again; if find_child_recursively also descends into matched items,
    # items nested three or more levels deep would be processed more than
    # once -- TODO confirm against wikitextprocessor.
    for nested_item in list_item_node.find_child_recursively(
        NodeKind.LIST_ITEM
    ):
        found.extend(
            process_pron_list_item(wxr, nested_item, raw_tags, lang_code)
        )

    return found
def process_sound_list_item_templates(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
    after_colon: bool,
    pre_nodes: list[WikiNode],
    lang_code: str,
) -> list[Sound]:
    """Dispatch a template found in a pronunciation list item.

    Returns the sounds produced by the matching handler. Templates that are
    not recognised, are not in ``ASPIRATED_H_TEMPLATES`` and appear before
    any colon are treated as a location: their cleaned text is appended to
    ``raw_tags`` (the caller's list is modified in place) and an empty list
    is returned.
    """
    name = t_node.template_name

    if name in PRON_TEMPLATES:
        return process_pron_template(
            wxr, t_node, raw_tags, lang_code, pre_nodes
        )
    if name == "lang":
        return extract_lang_template(wxr, t_node, raw_tags, lang_code)
    if name in {"écouter", "audio", "pron-rég"}:
        return [process_ecouter_template(wxr, t_node, raw_tags)]
    if name == "pron-rimes":
        return [process_pron_rimes_template(wxr, t_node, raw_tags)]

    # location
    if name not in ASPIRATED_H_TEMPLATES and not after_colon:
        raw_tags.append(clean_node(wxr, None, t_node))

    return []
def process_pron_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
    lang_code: str,
    previous_nodes: "list[WikiNode] | None" = None,
) -> list[Sound]:
    """Build IPA sounds from an expanded pronunciation template.

    Args:
        wxr: Extraction context used to expand and clean nodes.
        t_node: The pronunciation template node.
        raw_tags: Raw tags copied onto every created sound.
        lang_code: Language code of the entry (not used by this function).
        previous_nodes: Nodes preceding ``t_node``; only the last one is
            inspected. ``None`` (the default) is treated as no nodes.

    Returns:
        One ``Sound`` per non-empty ``span`` of class "API" found in the
        expanded template, or an empty list when the template is in
        ``PRON_TEMPLATES`` and its first parameter cleans to an empty string.
    """
    # A None default avoids sharing one mutable list between all calls.
    if previous_nodes is None:
        previous_nodes = []

    if (
        t_node.template_name in PRON_TEMPLATES
        and clean_node(wxr, None, t_node.template_parameters.get(1, "")) == ""
    ):
        # some pages don't pass IPA parameter to the "pron" template
        # and expand to an edit link for adding the missing data.
        return []

    # https://en.wikipedia.org/wiki/Aspirated_h
    # https://fr.wiktionary.org/wiki/Modèle:h_aspiré
    aspirated_h = ""
    if previous_nodes:
        last_node = previous_nodes[-1]
        if (
            isinstance(last_node, TemplateNode)
            and last_node.template_name in ASPIRATED_H_TEMPLATES
        ):
            # The cleaned text of the "h" template is prefixed to the IPA.
            aspirated_h = clean_node(wxr, None, last_node)

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds_list = []
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="API"
    ):
        ipa = clean_node(wxr, None, span_tag)
        if ipa != "":
            sound = Sound(raw_tags=raw_tags[:], ipa=aspirated_h + ipa)
            translate_raw_tags(sound)
            sounds_list.append(sound)
    return sounds_list
def extract_lang_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
    lang_code: str,
) -> list[Sound]:
    """Create sounds from the comma-separated text of a "lang" template.

    The cleaned template text is split on commas; every distinct, non-empty
    piece becomes a ``Sound`` carrying a copy of ``raw_tags``. The text is
    stored in ``zh_pron`` when ``lang_code`` is "zh" and in ``ipa``
    otherwise.
    """
    sounds = []
    field = "zh_pron" if lang_code == "zh" else "ipa"
    pron_texts = clean_node(wxr, None, t_node)
    prons = set()
    # Split on the ASCII comma and on the fullwidth comma (U+FF0C). The
    # previous pattern listed the ASCII comma twice, so fullwidth-separated
    # text was never split. The escape keeps the character unambiguous.
    for pron_text in re.split("[,\uff0c]", pron_texts):
        pron_text = pron_text.strip()
        # `prons` filters out repeated pronunciations within one template.
        if len(pron_text) > 0 and pron_text not in prons:
            prons.add(pron_text)
            sound = Sound(raw_tags=raw_tags[:])
            setattr(sound, field, pron_text)
            translate_raw_tags(sound)
            sounds.append(sound)
    return sounds
def process_ecouter_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Build a ``Sound`` from a sound file template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:écouter

    The first parameter is used as a location raw tag (surrounding
    parentheses removed), the second parameter or else ``pron`` as IPA, and
    ``audio`` as the sound file name.
    """
    params = template_node.template_parameters

    location = clean_node(wxr, None, params.get(1, ""))
    if location.startswith("(") and location.endswith(")"):
        location = location.strip("()")
    ipa = clean_node(wxr, None, params.get(2, params.get("pron", "")))
    audio_file = clean_node(wxr, None, params.get("audio", ""))

    sound = Sound()
    if raw_tags:
        sound.raw_tags = raw_tags[:]
    if location:
        sound.raw_tags.append(location)
    if ipa:
        sound.ipa = ipa
    if audio_file:
        set_sound_file_url_fields(wxr, audio_file, sound)
    translate_raw_tags(sound)
    return sound
def is_ipa_text(text: str) -> bool:
    """Tell whether ``text`` looks like IPA; used for inflection table cells.

    The text qualifies when it is wrapped in backslashes, in slashes or in
    square brackets, or when it starts with "ou " and ends with a backslash.
    """
    delimiter_pairs = (
        ("\\", "\\"),
        ("/", "/"),
        ("[", "]"),
        # some inflection table template like "en-nom-rég" might have a
        # second ipa text in a new line
        ("ou ", "\\"),
    )
    return any(
        text.startswith(opening) and text.endswith(closing)
        for opening, closing in delimiter_pairs
    )
def process_pron_rimes_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Build a ``Sound`` holding the IPA and rhyme of a "pron-rimes" template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:pron-rimes

    The template is fully expanded; the text of the first ``span`` becomes
    ``ipa`` and the text of the second one becomes ``rhymes``.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    sound = Sound()
    # Span position -> name of the Sound field that receives its text.
    span_fields = ("ipa", "rhymes")
    for position, span in enumerate(expanded.find_html_recursively("span")):
        span_text = clean_node(wxr, None, span)
        if position < len(span_fields):
            setattr(sound, span_fields[position], span_text)

    if raw_tags:
        sound.raw_tags = raw_tags[:]
    translate_raw_tags(sound)
    # Run clean_node with the sound as target on the whole expansion.
    clean_node(wxr, sound, expanded)
    return sound
def process_cmn_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Extract sounds from the list produced by a "cmn-pron" template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:cmn-pron

    The template wikitext is parsed with ``pre_expand`` enabled and the
    template's own name in ``additional_expand``; every item of the
    resulting top-level lists is processed with the "zh" language code.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(
        wikitext,
        pre_expand=True,
        additional_expand={template_node.template_name},
    )

    sounds: list[Sound] = []
    for pron_list in expanded.find_child(NodeKind.LIST):
        for pron_item in pron_list.find_child(NodeKind.LIST_ITEM):
            sounds.extend(process_pron_list_item(wxr, pron_item, [], "zh"))
    return sounds
def extract_homophone_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
    title_cats: list[str],
) -> None:
    """Collect homophones listed in a section and attach them to entries.

    When ``page_data`` is not empty, the sounds, ``title_cats`` and the
    categories of each sound are added to every entry sharing
    ``base_data.lang_code``; otherwise they are added to ``base_data``.
    """
    homophones: list[Sound] = []
    for homophone_list in level_node.find_child(NodeKind.LIST):
        for item in homophone_list.find_child(NodeKind.LIST_ITEM):
            homophones.extend(extract_homophone_list_item(wxr, item))

    # Decide which entries are updated, then apply the same update to each.
    if page_data:
        targets = [
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        ]
    else:
        targets = [base_data]

    for entry in targets:
        entry.sounds.extend(homophones)
        entry.categories.extend(title_cats)
        for homophone in homophones:
            entry.categories.extend(homophone.categories)
def extract_homophone_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Return one homophone ``Sound`` per link or "l"/"lien" template.

    Links contribute their cleaned text; "l" and "lien" templates are
    resolved through ``process_lien_template`` and also provide ``roman``.
    Children yielding an empty word are skipped.
    """
    homophones: list[Sound] = []
    for child in list_item.children:
        is_link = isinstance(child, WikiNode) and child.kind == NodeKind.LINK
        if is_link:
            linked_word = clean_node(wxr, None, child)
            if linked_word != "":
                homophones.append(Sound(homophone=linked_word))
        elif isinstance(child, TemplateNode) and child.template_name in (
            "l",
            "lien",
        ):
            # Imported here, as in the original code, rather than at module
            # level.
            from .linkage import process_lien_template

            linkage = Linkage(word="")
            process_lien_template(wxr, child, linkage)
            if linkage.word != "":
                homophones.append(
                    Sound(homophone=linkage.word, roman=linkage.roman)
                )

    return homophones