Coverage for src/wiktextract/extractor/ru/pronunciation.py: 83%
131 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from typing import Union
4from wikitextprocessor.parser import (
5 LevelNode,
6 NodeKind,
7 WikiNode,
8)
10from ...page import clean_node
11from ...wxr_context import WiktextractContext
12from ..share import set_sound_file_url_fields
13from .models import Sound, WordEntry
def process_transcription_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Turn one ``transcription`` template into a sound entry.

    Positional parameter 1 is read as the IPA text and positional
    parameter 2 as the audio file name; the norm label and the homophone
    list are read from their named parameters. The resulting ``Sound`` is
    appended to ``word_entry.sounds`` only if at least one of its fields
    ended up different from the model defaults.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription
    params = template_node.template_parameters
    entry = Sound()

    extract_ipa(wxr, entry, params, 1)
    extract_audio_file(wxr, entry, params, 2)
    extract_tags(wxr, entry, params)
    extract_homophones(wxr, entry, params)

    # Nothing was extracted: do not record an empty sound.
    if not entry.model_dump(exclude_defaults=True):
        return
    word_entry.sounds.append(entry)
def process_transcriptions_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Turn one ``transcriptions`` template into singular/plural sounds.

    Positional parameters 1 and 2 are read as the singular and plural IPA
    text, 3 and 4 as the singular and plural audio file names. The norm
    label is applied to both sounds; homophones are attached to the
    singular sound only. A sound is appended to ``word_entry.sounds``
    (tagged ``"singular"`` or ``"plural"``) only when it is non-default and
    has an IPA string or an audio value.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcriptions
    params = template_node.template_parameters
    singular, plural = Sound(), Sound()

    extract_ipa(wxr, singular, params, 1)
    extract_ipa(wxr, plural, params, 2)

    extract_audio_file(wxr, singular, params, 3)
    extract_audio_file(wxr, plural, params, 4)

    extract_tags(wxr, [singular, plural], params)

    extract_homophones(wxr, singular, params)

    for number_tag, entry in (("singular", singular), ("plural", plural)):
        if not entry.model_dump(exclude_defaults=True):
            continue
        # A norm label or homophones alone are not enough here: an actual
        # pronunciation (IPA or audio) is required.
        if not (entry.ipa or entry.audio):
            continue
        entry.tags.append(number_tag)
        word_entry.sounds.append(entry)
def process_transcription_ru_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Turn one ``transcription-ru`` template into a sound entry.

    The IPA text is taken from the ``вручную`` parameter when it is
    non-empty; otherwise the whole template is rendered to text and the
    content of the first ``[...]`` pair is used. Audio comes from
    positional parameter 2. The sound is appended to ``word_entry.sounds``
    only if something non-default was extracted.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-ru
    params = template_node.template_parameters
    entry = Sound()

    transcription = clean_node(wxr, {}, params.get("вручную", ""))
    if not transcription:
        # No explicit value: fall back to the bracketed text in the
        # rendered template output.
        rendered = clean_node(wxr, {}, template_node)
        bracketed = re.search(r"\[(.*?)\]", rendered)
        if bracketed is not None:
            transcription = bracketed.group(1)

    if transcription:
        entry.ipa = transcription

    extract_audio_file(wxr, entry, params, 2)
    extract_homophones(wxr, entry, params)
    extract_tags(wxr, entry, params)

    if not entry.model_dump(exclude_defaults=True):
        return
    word_entry.sounds.append(entry)
def process_transcriptions_ru_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Turn one ``transcriptions-ru`` template into singular/plural sounds.

    The template is rendered to text; the first ``[...]`` content becomes
    the singular IPA and the second, if present, the plural IPA. Audio file
    names come from positional parameters 3 (singular) and 4 (plural). The
    norm label is applied to both sounds and homophones to the singular
    one. Each sound that differs from the model defaults is tagged
    ``"singular"`` / ``"plural"`` and appended to ``word_entry.sounds``.
    """
    params = template_node.template_parameters
    singular, plural = Sound(), Sound()

    rendered = clean_node(wxr, {}, template_node)
    # zip() stops at the shorter side, so zero, one or two transcriptions
    # are assigned depending on how many bracketed groups were found.
    for entry, transcription in zip(
        (singular, plural), re.findall(r"\[(.*?)\]", rendered)
    ):
        entry.ipa = transcription

    extract_audio_file(wxr, singular, params, 3)
    extract_audio_file(wxr, plural, params, 4)

    extract_tags(wxr, [singular, plural], params)

    extract_homophones(wxr, singular, params)

    for number_tag, entry in (("singular", singular), ("plural", plural)):
        if entry.model_dump(exclude_defaults=True):
            entry.tags.append(number_tag)
            word_entry.sounds.append(entry)
def process_transcription_la_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Turn one ``transcription-la`` template into a sound entry.

    The template is rendered to text and searched for the first
    ``(label): [ipa]`` occurrence. When found, a ``Sound`` carrying the IPA
    text and the stripped label as its single raw tag is appended to
    ``word_entry.sounds``; otherwise nothing is added.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-la
    rendered = clean_node(wxr, {}, template_node)
    found = re.search(r"\((.*?)\): \[(.*?)\]", rendered)
    if found is None:
        return

    label, transcription = found.groups()
    entry = Sound()
    entry.ipa = transcription
    entry.raw_tags = [label.strip()]
    word_entry.sounds.append(entry)
def process_transcription_grc_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Turn one ``transcription-grc`` template into sound entries.

    The template is rendered to text and every ``* label: [ipa]`` item in
    it yields one ``Sound`` with the IPA text and the stripped label as its
    single raw tag. The sounds are appended to ``word_entry.sounds`` in the
    order they appear; if no item matches, nothing is added.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-grc
    rendered = clean_node(wxr, {}, template_node)
    # Each match is a (label, ipa) tuple because the pattern has two groups.
    for label, ipa in re.findall(r"\* (.*?): \[(.*?)\]", rendered):
        word_entry.sounds.append(Sound(ipa=ipa, raw_tags=[label.strip()]))
def extract_ipa(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: Union[str, int],
):
    """Copy the cleaned text of parameter ``key`` into ``sound.ipa``.

    A missing parameter is treated as an empty string. ``sound`` is left
    unchanged when the cleaned text is empty.
    """
    raw_value = template_params.get(key, "")
    transcription = clean_node(wxr, {}, raw_value)
    if not transcription:
        return
    sound.ipa = transcription
def extract_audio_file(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: Union[str, int],
):
    """Attach the audio file named by parameter ``key`` to ``sound``.

    A missing parameter is treated as an empty string. When the cleaned
    file name is non-empty it is handed to ``set_sound_file_url_fields``
    together with ``sound``; otherwise ``sound`` is left unchanged.
    """
    raw_value = template_params.get(key, "")
    file_name = clean_node(wxr, None, raw_value)
    if not file_name:
        return
    set_sound_file_url_fields(wxr, file_name, sound)
def extract_tags(
    wxr: WiktextractContext,
    sounds: Union[Sound, list[Sound]],
    template_params: dict[str, WikiNode],
):
    """Store the ``норма`` parameter as the raw tag of the given sound(s).

    ``sounds`` may be a single ``Sound`` or a list of them. When the
    cleaned parameter text is non-empty, ``raw_tags`` of every given sound
    is replaced by a fresh one-element list holding that text; otherwise
    nothing is changed.
    """
    label = clean_node(wxr, None, template_params.get("норма", ""))
    if not label:
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for target in targets:
        target.raw_tags = [label]
def extract_homophones(
    wxr: WiktextractContext,
    sounds: Union[Sound, list[Sound]],
    template_params: dict[str, WikiNode],
):
    """Store the ``омофоны`` parameter as homophones of the given sound(s).

    The cleaned parameter text is split on commas; each piece is stripped
    and empty pieces are dropped. ``sounds`` may be a single ``Sound`` or a
    list of them. When at least one homophone remains, ``homophones`` of
    every given sound is replaced by that list; otherwise nothing is
    changed.
    """
    raw_text = clean_node(wxr, {}, template_params.get("омофоны", ""))
    homophones = [
        name for piece in raw_text.split(",") if (name := piece.strip())
    ]
    if not homophones:
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for target in targets:
        # Give every sound its own list so that a later in-place change to
        # one sound's homophones cannot leak into the others.
        target.homophones = list(homophones)
# Dispatch table: pronunciation template name -> function that converts a
# node of that template into entries of ``word_entry.sounds``.
# A value of None marks a template name that is recognised but has no
# processor; extract_pronunciation_section does nothing for those nodes.
TRANSCRIPTION_TEMPLATE_PROCESSORS = {
    "transcription": process_transcription_template,
    "transcriptions": process_transcriptions_template,
    "transcription-ru": process_transcription_ru_template,
    "transcriptions-ru": process_transcriptions_ru_template,
    "transcription-la": process_transcription_la_template,
    "transcription-uk": None,
    "transcription-grc": process_transcription_grc_template,
    "transcription eo": None,
}
def extract_pronunciation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect sound data from the template children of ``level_node``.

    Templates listed in ``TRANSCRIPTION_TEMPLATE_PROCESSORS`` are passed to
    their processor (entries mapped to ``None`` are skipped). For the
    ``audio`` / ``аудио`` / ``медиа`` templates, the file named by
    positional parameter 1 is attached to the last sound already in
    ``word_entry.sounds``, or to a new ``Sound`` appended to that list when
    it is still empty. All other templates are ignored.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name

        if name in TRANSCRIPTION_TEMPLATE_PROCESSORS:
            handler = TRANSCRIPTION_TEMPLATE_PROCESSORS[name]
            if handler is not None:
                handler(wxr, word_entry, template)
            continue

        if name not in ("audio", "аудио", "медиа"):
            continue

        file_name = clean_node(
            wxr, None, template.template_parameters.get(1, "")
        ).strip()
        if not file_name:
            continue

        # Reuse the most recent sound when there is one; only a brand-new
        # sound has to be added to the list afterwards.
        needs_new_sound = not word_entry.sounds
        target = Sound() if needs_new_sound else word_entry.sounds[-1]
        set_sound_file_url_fields(wxr, file_name, target)
        if needs_new_sound:
            word_entry.sounds.append(target)
def extract_homophone_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect homophones from the links found under ``level_node``.

    Every link node below ``level_node`` (searched recursively) is cleaned
    to text; non-empty results are gathered in document order. If any were
    found, a single ``Sound`` holding them as ``homophones`` is appended to
    ``word_entry.sounds``.
    """
    link_texts = [
        text
        for link in level_node.find_child_recursively(NodeKind.LINK)
        if (text := clean_node(wxr, None, link))
    ]
    if link_texts:
        word_entry.sounds.append(Sound(homophones=link_texts))