Coverage for src/wiktextract/extractor/fr/pronunciation.py: 86%
140 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import set_sound_file_url_fields
8from .models import Sound, WordEntry
9from .tags import translate_raw_tags
def extract_pronunciation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    base_data: WordEntry,
) -> None:
    """Collect sounds under a pronunciation section and attach them.

    Sounds are gathered from the list children of ``level_node`` and from
    its "cmn-pron" / "zh-cmn-pron" template children.  Nothing is attached
    when no sound was found.

    Where the sounds go:

    * ``page_data`` is empty: ``base_data``.
    * ``level_node`` is a level 3 title: every entry of ``page_data`` whose
      ``lang_code`` equals ``base_data.lang_code``.
    * otherwise: only the last entry of ``page_data``.

    The categories of every sound are also appended to each receiving
    entry's ``categories``.
    """
    lang_code = base_data.lang_code
    collected = []
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                collected += process_pron_list_item(wxr, item, [], lang_code)
        elif isinstance(child, TemplateNode) and child.template_name in [
            "cmn-pron",
            "zh-cmn-pron",
        ]:
            collected += process_cmn_pron_template(wxr, child)

    if not collected:
        return

    # Decide which entries receive the sounds, then attach in one place.
    if len(page_data) == 0:
        receivers = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        receivers = [
            entry for entry in page_data if entry.lang_code == lang_code
        ]
    else:
        receivers = [page_data[-1]]

    for entry in receivers:
        entry.sounds.extend(collected)
        for sound in collected:
            entry.categories.extend(sound.categories)
# Names of templates that are handled by `process_pron_template`.
PRON_TEMPLATES = frozenset(
    {
        "prononciation",
        "pron",  # redirect to "prononciation"
        "//",  # redirect to "prononciation"
        "phon",  # redirect to "prononciation"
        "pron-recons",  # use "pron"
        "prononciation reconstruite",  # redirect to "pron-recons"
        "pron recons",  # redirect to "pron-recons"
        # used in template "cmn-pron", which expands to list of Pinyin
        "lang",
    }
)
# Templates that mark the kind of initial "h"; they are not pronunciation
# templates themselves but may directly precede one.
ASPIRATED_H_TEMPLATES = frozenset(
    {
        "h aspiré",
        "h",  # redirect to "h aspiré"
        "h muet",
    }
)
def process_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    parent_raw_tags: list[str],
    lang_code: str,
) -> list[Sound]:
    """Extract sounds from one pronunciation list item and its sub-items.

    The children of ``list_item_node`` are walked in order:

    * plain text containing ":" turns on the "after colon" state, and any
      non-blank text after the first ":" becomes a sound (stored in
      ``zh_pron`` when ``lang_code`` is "zh", else in ``ipa``);
    * templates are dispatched to `process_sound_list_item_templates`,
      together with the node that immediately precedes them (if any);
    * bold nodes add a raw tag for everything that follows;
    * each ``<span>`` found inside a link node becomes an IPA sound.

    ``parent_raw_tags`` is copied and never modified.  Nested list items are
    processed recursively with the raw tags accumulated for this item.
    """
    tags = list(parent_raw_tags)
    found = []
    text_field = "ipa" if lang_code != "zh" else "zh_pron"
    seen_colon = False
    children = list_item_node.children
    for position, child in enumerate(children):
        if isinstance(child, str):
            if ":" not in child:
                continue
            seen_colon = True
            text_after = child.partition(":")[2].strip()
            if text_after:
                sound = Sound(raw_tags=list(tags))
                setattr(sound, text_field, text_after)
                translate_raw_tags(sound)
                found.append(sound)
        elif isinstance(child, TemplateNode):
            # The slice is empty for the first child, otherwise it holds
            # exactly the preceding sibling.
            preceding = children[position - 1 : position]
            found += process_sound_list_item_templates(
                wxr, child, tags, seen_colon, preceding
            )
        elif isinstance(child, WikiNode):
            if child.kind == NodeKind.BOLD:
                tags.append(clean_node(wxr, None, child))
            elif child.kind == NodeKind.LINK:
                for span in child.find_html_recursively("span"):
                    span_ipa = clean_node(wxr, None, span)
                    sound = Sound(ipa=span_ipa, raw_tags=list(tags))
                    translate_raw_tags(sound)
                    found.append(sound)

    # NOTE(review): `find_child_recursively` presumably yields list items at
    # every depth, and each of them recurses again, so items nested two or
    # more levels deep may be visited more than once - confirm.
    for sub_item in list_item_node.find_child_recursively(NodeKind.LIST_ITEM):
        found += process_pron_list_item(wxr, sub_item, tags, lang_code)

    return found
def process_sound_list_item_templates(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
    after_colon: bool,
    pre_nodes: list[WikiNode],
) -> list[Sound]:
    """Dispatch a template found in a pronunciation list item.

    Returns the sounds produced by the matching handler.  Templates that are
    neither pronunciation, audio, rhyme nor "h" templates yield no sound;
    when such a template appears before any colon, its cleaned text is
    appended to ``raw_tags`` (the caller's list is modified in place).
    """
    name = template_node.template_name
    if name in PRON_TEMPLATES:
        return process_pron_template(wxr, template_node, raw_tags, pre_nodes)
    if name in {"écouter", "audio", "pron-rég"}:
        return [process_ecouter_template(wxr, template_node, raw_tags)]
    if name == "pron-rimes":
        return [process_pron_rimes_template(wxr, template_node, raw_tags)]

    # "h" templates are consumed by `process_pron_template` through the
    # preceding-node argument, so they never become a tag here.
    if name not in ASPIRATED_H_TEMPLATES and not after_colon:  # location
        raw_tags.append(clean_node(wxr, None, template_node))
    return []
def process_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
    previous_nodes: "list[WikiNode] | None" = None,
) -> list[Sound]:
    """Build sounds from a pronunciation template.

    The cleaned expansion of ``template_node`` is split on commas (ASCII
    "," and fullwidth "，"); every distinct, non-blank piece becomes one
    sound carrying a copy of ``raw_tags``.  The text is stored in
    ``zh_pron`` for the "lang" template and in ``ipa`` otherwise.

    Args:
        wxr: extraction context passed through to ``clean_node``.
        template_node: the pronunciation template.
        raw_tags: tags copied onto every created sound (not modified).
        previous_nodes: nodes preceding the template; when the last one is
            an aspirated-h template, its cleaned text is prefixed to every
            pronunciation.  ``None`` (the default) means no preceding node.

    Returns:
        The created sounds; empty when the template has an empty string as
        first argument or expands to empty text.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if previous_nodes is None:
        previous_nodes = []

    first_arg = template_node.template_parameters.get(1, "")
    if (
        template_node.template_name in PRON_TEMPLATES
        and isinstance(first_arg, str)
        and len(first_arg) == 0
    ):
        # some pages don't pass IPA parameter to the "pron" template
        # and expand to an edit link for adding the missing data.
        return []

    sounds_list = []
    pron_texts = clean_node(wxr, None, template_node)
    # https://en.wikipedia.org/wiki/Aspirated_h
    # https://fr.wiktionary.org/wiki/Modèle:h_aspiré
    aspirated_h = ""
    if len(previous_nodes) > 0 and isinstance(previous_nodes[-1], TemplateNode):
        if previous_nodes[-1].template_name in ASPIRATED_H_TEMPLATES:
            aspirated_h = clean_node(wxr, None, previous_nodes[-1])

    if len(pron_texts) > 0:
        use_key = "zh_pron" if template_node.template_name == "lang" else "ipa"
        prons = set()  # pronunciations already emitted, to skip duplicates
        # The previous pattern listed the ASCII comma twice; the second
        # alternative is the fullwidth comma used in Chinese text.
        for pron_text in re.split(r"[,，]", pron_texts):
            pron_text = pron_text.strip()
            if len(pron_text) > 0 and pron_text not in prons:
                prons.add(pron_text)
                sound = Sound(raw_tags=raw_tags[:])
                setattr(sound, use_key, aspirated_h + pron_text)
                translate_raw_tags(sound)
                sounds_list.append(sound)
    return sounds_list
def process_ecouter_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Build a sound from an audio template.

    sound file template: https://fr.wiktionary.org/wiki/Modèle:écouter

    Template arguments read:

    * 1: location; when wrapped in parentheses they are stripped.  A
      non-empty location is appended to the sound's raw tags.
    * 2, falling back to "pron": IPA text.
    * "audio": sound file name, handed to ``set_sound_file_url_fields``.

    ``raw_tags`` is copied onto the sound when it is not empty.
    """
    params = template_node.template_parameters

    location = clean_node(wxr, None, params.get(1, ""))
    if location.startswith("(") and location.endswith(")"):
        location = location.strip("()")
    ipa = clean_node(wxr, None, params.get(2, params.get("pron", "")))
    audio_file = clean_node(wxr, None, params.get("audio", ""))

    sound = Sound()
    if raw_tags:
        sound.raw_tags = list(raw_tags)
    if location:
        sound.raw_tags.append(location)
    if ipa:
        sound.ipa = ipa
    if audio_file:
        set_sound_file_url_fields(wxr, audio_file, sound)
    translate_raw_tags(sound)
    return sound
def is_ipa_text(text: str) -> bool:
    """Tell whether ``text`` looks like IPA; used for inflection table cells.

    The text must end with a backslash and start either with a backslash or
    with "ou ".  The latter covers inflection table templates such as
    "en-nom-rég", which might have a second IPA text in a new line.
    """
    return text.endswith("\\") and text.startswith(("\\", "ou "))
def process_pron_rimes_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Build a sound from the "pron-rimes" template.

    https://fr.wiktionary.org/wiki/Modèle:pron-rimes

    The template is fully expanded; the text of the first ``<span>`` in the
    expansion is stored as ``ipa`` and the text of the second as ``rhymes``.
    ``raw_tags`` is copied onto the sound when it is not empty.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    sound = Sound()
    span_fields = ("ipa", "rhymes")  # target field by span position
    for position, span in enumerate(expanded.find_html_recursively("span")):
        span_text = clean_node(wxr, None, span)
        if position < len(span_fields):
            setattr(sound, span_fields[position], span_text)

    if raw_tags:
        sound.raw_tags = list(raw_tags)
    translate_raw_tags(sound)
    # NOTE(review): presumably records data found in the expansion (e.g.
    # categories) on `sound` - confirm against `clean_node`.
    clean_node(wxr, sound, expanded)
    return sound
def process_cmn_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Extract sounds from the "cmn-pron" template.

    https://fr.wiktionary.org/wiki/Modèle:cmn-pron

    The template is parsed with itself pre-expanded, and every list item
    directly under a top-level list of the result is processed as a "zh"
    pronunciation list item with no inherited raw tags.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(
        wikitext,
        pre_expand=True,
        additional_expand={template_node.template_name},
    )
    zh_sounds = []
    for zh_list in expanded.find_child(NodeKind.LIST):
        for zh_item in zh_list.find_child(NodeKind.LIST_ITEM):
            zh_sounds += process_pron_list_item(wxr, zh_item, [], "zh")
    return zh_sounds