Coverage for src/wiktextract/extractor/ru/pronunciation.py: 84%
148 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
2from typing import Union
4from wikitextprocessor.parser import (
5 LevelNode,
6 NodeKind,
7 WikiNode,
8)
10from ...page import clean_node
11from ...wxr_context import WiktextractContext
12from ..share import set_sound_file_url_fields
13from .models import Sound, WordEntry
14from .tags import translate_raw_tags
def process_transcription_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Build one ``Sound`` from a ``transcription`` template.

    Positional argument 1 is read as the IPA value and positional
    argument 2 as the audio file; the ``норма`` and ``омофоны`` named
    arguments are handled by ``extract_tags`` and ``extract_homophones``.
    The sound is appended to ``word_entry.sounds`` only when at least one
    of its fields differs from the model defaults.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription
    params = template_node.template_parameters
    sound = Sound()

    extract_ipa(wxr, sound, params, 1)
    extract_audio_file(wxr, sound, params, 2)
    extract_tags(wxr, sound, params)
    extract_homophones(wxr, sound, params)

    # An empty dump means nothing was extracted; skip such sounds.
    if sound.model_dump(exclude_defaults=True):
        word_entry.sounds.append(sound)
def process_transcriptions_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Build singular and plural ``Sound`` objects from ``transcriptions``.

    Positional arguments 1/2 hold the singular/plural IPA and 3/4 the
    singular/plural audio files. The ``норма`` tag is applied to both
    sounds, while homophones are attached to the singular sound only.
    A sound is appended (tagged ``singular`` or ``plural``) only when it
    has an IPA or audio value.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcriptions
    singular = Sound()
    plural = Sound()
    params = template_node.template_parameters

    extract_ipa(wxr, singular, params, 1)
    extract_ipa(wxr, plural, params, 2)

    extract_audio_file(wxr, singular, params, 3)
    extract_audio_file(wxr, plural, params, 4)

    extract_tags(wxr, [singular, plural], params)
    extract_homophones(wxr, singular, params)

    for sound, number_tag in ((singular, "singular"), (plural, "plural")):
        has_data = sound.model_dump(exclude_defaults=True) != {}
        # Tags or homophones alone are not enough to emit a sound.
        if has_data and (sound.ipa or sound.audio):
            sound.tags.append(number_tag)
            word_entry.sounds.append(sound)
def process_transcription_ru_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Build one ``Sound`` from a ``transcription-ru`` template.

    The IPA comes from the ``вручную`` argument when it is non-empty;
    otherwise the whole template is passed through ``clean_node`` and the
    first ``[...]`` span in the resulting text is used. Audio is read from
    positional argument 2. The sound is appended only when some field
    differs from the model defaults.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-ru
    sound = Sound()
    params = template_node.template_parameters

    ipa = clean_node(wxr, {}, params.get("вручную", ""))
    if not ipa:
        # No manual transcription: fall back to the cleaned template text.
        template_text = clean_node(wxr, {}, template_node)
        bracketed = re.search(r"\[(.*?)\]", template_text)
        if bracketed is not None:
            ipa = bracketed.group(1)
    if ipa:
        sound.ipa = ipa

    extract_audio_file(wxr, sound, params, 2)
    extract_homophones(wxr, sound, params)
    extract_tags(wxr, sound, params)

    if sound.model_dump(exclude_defaults=True):
        word_entry.sounds.append(sound)
def process_transcriptions_ru_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Build singular and plural ``Sound`` objects from ``transcriptions-ru``.

    The template is passed through ``clean_node`` and the first two
    ``[...]`` spans in the resulting text become the singular and plural
    IPA. Positional arguments 3/4 hold the singular/plural audio files.
    Each sound is appended (tagged ``singular`` or ``plural``) when any of
    its fields differs from the model defaults.
    """
    singular = Sound()
    plural = Sound()
    params = template_node.template_parameters

    template_text = clean_node(wxr, {}, template_node)
    bracketed = re.findall(r"\[(.*?)\]", template_text)
    # zip() stops at the shorter sequence, so only the matches that exist
    # are assigned: the first to the singular, the second to the plural.
    for sound, ipa in zip((singular, plural), bracketed):
        sound.ipa = ipa

    extract_audio_file(wxr, singular, params, 3)
    extract_audio_file(wxr, plural, params, 4)

    extract_tags(wxr, [singular, plural], params)
    extract_homophones(wxr, singular, params)

    for sound, number_tag in ((singular, "singular"), (plural, "plural")):
        if sound.model_dump(exclude_defaults=True):
            sound.tags.append(number_tag)
            word_entry.sounds.append(sound)
def process_transcription_la_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Build one ``Sound`` from a ``transcription-la`` template.

    The cleaned template text is searched for ``(label): [ipa]``. When it
    is found, the bracketed part becomes the IPA and the parenthesised
    label becomes the single raw tag; otherwise nothing is appended.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-la
    template_text = clean_node(wxr, {}, template_node)
    found = re.search(r"\((.*?)\): \[(.*?)\]", template_text)
    if found is None:
        return

    label, ipa = found.groups()
    sound = Sound()
    sound.ipa = ipa
    sound.raw_tags = [label.strip()]
    word_entry.sounds.append(sound)
def process_transcription_grc_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Append one ``Sound`` per labelled IPA line of ``transcription-grc``.

    The cleaned template text is scanned for ``* label: [ipa]`` entries.
    Each match yields a ``Sound`` with that IPA and the stripped label as
    its only raw tag. Nothing is appended when there is no match.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-grc
    template_text = clean_node(wxr, {}, template_node)
    # No placeholder Sound is created up front: every appended sound is
    # built from a regex match inside the loop.
    for label, ipa in re.findall(r"\* (.*?): \[(.*?)\]", template_text):
        word_entry.sounds.append(Sound(ipa=ipa, raw_tags=[label.strip()]))
def extract_ipa(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: Union[str, int],
):
    """Set ``sound.ipa`` from the template argument stored under ``key``.

    The argument (or ``""`` when it is missing) is passed through
    ``clean_node``; ``sound`` is left untouched when the result is empty.
    """
    value = clean_node(wxr, {}, template_params.get(key, ""))
    if not value:
        return
    sound.ipa = value
def extract_audio_file(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: Union[str, int],
):
    """Attach the audio file named by template argument ``key`` to ``sound``.

    The argument (or ``""`` when it is missing) is passed through
    ``clean_node``; a non-empty result is handed to
    ``set_sound_file_url_fields`` together with ``sound``.
    """
    file_name = clean_node(wxr, None, template_params.get(key, ""))
    if file_name == "":
        return
    set_sound_file_url_fields(wxr, file_name, sound)
def extract_tags(
    wxr: WiktextractContext,
    sounds: Union[Sound, list[Sound]],
    template_params: dict[str, WikiNode],
):
    """Set ``raw_tags`` from the ``норма`` template argument.

    ``sounds`` may be a single ``Sound`` or a list of them. When the
    cleaned ``норма`` value is non-empty, each sound's ``raw_tags`` is
    replaced by a fresh one-element list holding that value.
    """
    norm = clean_node(wxr, None, template_params.get("норма", ""))
    if norm == "":
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for sound in targets:
        sound.raw_tags = [norm]
def extract_homophones(
    wxr: WiktextractContext,
    sounds: Union[Sound, list[Sound]],
    template_params: dict[str, WikiNode],
):
    """Set ``homophones`` from the ``омофоны`` template argument.

    The cleaned argument is split on commas, and blank pieces are dropped
    after stripping. ``sounds`` may be a single ``Sound`` or a list of
    them; nothing is assigned when no homophone remains.
    """
    raw_value = clean_node(wxr, {}, template_params.get("омофоны", ""))
    stripped = (piece.strip() for piece in raw_value.split(","))
    homophones = [piece for piece in stripped if piece != ""]
    if not homophones:
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for sound in targets:
        # Give every sound its own list so that a later in-place change
        # on one sound cannot leak into the others.
        sound.homophones = list(homophones)
# Maps a pronunciation template name to the function that extracts sounds
# from it. A ``None`` value marks a template name that is recognised but
# has no extractor; extract_pronunciation_section skips those templates.
TRANSCRIPTION_TEMPLATE_PROCESSORS = {
    "transcription": process_transcription_template,
    "transcriptions": process_transcriptions_template,
    "transcription-ru": process_transcription_ru_template,
    "transcriptions-ru": process_transcriptions_ru_template,
    "transcription-la": process_transcription_la_template,
    "transcription-uk": None,  # no extractor
    "transcription-grc": process_transcription_grc_template,
    "transcription eo": None,  # no extractor
}
def extract_pronunciation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract sounds from the templates directly under ``level_node``.

    Templates listed in ``TRANSCRIPTION_TEMPLATE_PROCESSORS`` are handed
    to their processor (entries mapped to ``None`` are skipped). For
    ``audio``/``аудио``/``медиа`` templates, positional argument 1 is
    read as an audio file: it is attached to the last sound already
    collected, or to a new ``Sound`` when none exists yet.
    """
    audio_template_names = ("audio", "аудио", "медиа")

    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name

        if name in TRANSCRIPTION_TEMPLATE_PROCESSORS:
            handler = TRANSCRIPTION_TEMPLATE_PROCESSORS[name]
            if handler is not None:
                handler(wxr, word_entry, template)
            continue

        if name not in audio_template_names:
            continue

        first_arg = template.template_parameters.get(1, "")
        file_name = clean_node(wxr, None, first_arg).strip()
        if file_name == "":
            continue

        if word_entry.sounds:
            # Attach the file to the most recently collected sound.
            set_sound_file_url_fields(wxr, file_name, word_entry.sounds[-1])
        else:
            audio_only = Sound()
            set_sound_file_url_fields(wxr, file_name, audio_only)
            word_entry.sounds.append(audio_only)
def extract_homophone_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect the link texts under ``level_node`` as homophones.

    Every link node found recursively is passed through ``clean_node``
    and non-empty results are kept in order. When at least one remains,
    a single ``Sound`` holding all of them is appended.
    """
    links = level_node.find_child_recursively(NodeKind.LINK)
    link_texts = (clean_node(wxr, None, link) for link in links)
    homophones = [text for text in link_texts if len(text) > 0]
    if homophones:
        word_entry.sounds.append(Sound(homophones=homophones))
def extract_rhyme_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract rhymes from the list items directly under ``level_node``.

    Within one list item, a text node ending in ``:`` is split on commas
    into raw tags, and each link node becomes a ``Sound`` whose ``rhymes``
    is the cleaned link text. Raw tags accumulate across the item, so a
    link receives every tag seen before it in that item.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            labels = []
            for child in item.children:
                is_label = isinstance(child, str) and child.strip().endswith(
                    ":"
                )
                if is_label:
                    pieces = (
                        piece.strip()
                        for piece in child.strip(": ").split(",")
                    )
                    labels.extend(piece for piece in pieces if piece != "")
                    continue

                is_link = (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.LINK
                )
                if not is_link:
                    continue

                rhyme = clean_node(wxr, None, child)
                if rhyme == "":
                    continue
                sound = Sound(rhymes=rhyme, raw_tags=labels)
                translate_raw_tags(sound)
                word_entry.sounds.append(sound)