Coverage for src/wiktextract/extractor/ru/pronunciation.py: 85%
150 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
3from wikitextprocessor import HTMLNode, LevelNode, NodeKind, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import set_sound_file_url_fields
8from .models import Sound, WordEntry
9from .tags import translate_raw_tags
def process_transcription_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect one `Sound` from a "transcription" template.

    Positional parameter 1 is handed to `extract_ipa` and positional
    parameter 2 to `extract_audio_file`; raw tags and homophones are read
    by `extract_tags` and `extract_homophones`.  The sound is appended to
    ``word_entry.sounds`` only when it has an IPA, an audio value or at
    least one homophone.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription
    params = template_node.template_parameters
    entry_sound = Sound()
    extract_ipa(wxr, entry_sound, params, 1)
    extract_audio_file(wxr, entry_sound, params, 2)
    extract_tags(wxr, entry_sound, params)
    extract_homophones(wxr, entry_sound, params)

    nothing_extracted = (
        entry_sound.ipa == ""
        and entry_sound.audio == ""
        and len(entry_sound.homophones) == 0
    )
    if not nothing_extracted:
        word_entry.sounds.append(entry_sound)
def process_transcriptions_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect singular and plural `Sound`s from a "transcriptions" template.

    Positional parameters 1 and 2 are read as the IPA of the singular and
    plural sound, parameters 3 and 4 as their audio files.  Raw tags are
    applied to both sounds; homophones are applied to the singular sound
    only.  Each sound that carries any extracted data gets a "singular" or
    "plural" tag and is appended to ``word_entry.sounds``.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcriptions
    sound_sg = Sound()
    sound_pl = Sound()
    template_params = template_node.template_parameters
    extract_ipa(wxr, sound_sg, template_params, 1)
    extract_ipa(wxr, sound_pl, template_params, 2)
    extract_audio_file(wxr, sound_sg, template_params, 3)
    extract_audio_file(wxr, sound_pl, template_params, 4)
    extract_tags(wxr, [sound_sg, sound_pl], template_params)
    extract_homophones(wxr, sound_sg, template_params)

    # Homophones are attached to the singular sound above, so they must
    # count as content here; otherwise a template that only lists
    # homophones would be dropped without a trace.
    if (
        sound_sg.ipa != ""
        or sound_sg.audio != ""
        or len(sound_sg.homophones) > 0
    ):
        sound_sg.tags.append("singular")
        word_entry.sounds.append(sound_sg)

    # The plural sound never receives homophones in this function, so only
    # its IPA and audio decide whether it is kept.
    if sound_pl.ipa != "" or sound_pl.audio != "":
        sound_pl.tags.append("plural")
        word_entry.sounds.append(sound_pl)
def process_transcription_ru_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect one `Sound` from a "transcription-ru" template.

    The IPA is taken from the "вручную" parameter when that is non-empty;
    otherwise the first ``[...]`` group found in the cleaned text of the
    whole template is used.  Positional parameter 2 is read as the audio
    file.  The sound is stored only when it has an IPA, an audio value or
    at least one homophone.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-ru
    params = template_node.template_parameters
    ru_sound = Sound()
    ru_sound.ipa = clean_node(wxr, None, params.get("вручную", ""))
    if ru_sound.ipa == "":
        # No explicit value: look for a bracketed transcription in the
        # cleaned template text instead.
        template_text = clean_node(wxr, None, template_node)
        bracketed = re.search(r"\[.+?\]", template_text)
        if bracketed is not None:
            ru_sound.ipa = bracketed.group()
    extract_audio_file(wxr, ru_sound, params, 2)
    extract_homophones(wxr, ru_sound, params)
    extract_tags(wxr, ru_sound, params)

    nothing_extracted = (
        ru_sound.ipa == ""
        and ru_sound.audio == ""
        and len(ru_sound.homophones) == 0
    )
    if not nothing_extracted:
        word_entry.sounds.append(ru_sound)
def process_transcriptions_ru_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect singular and plural `Sound`s from a "transcriptions-ru" template.

    The first ``[...]`` group in the cleaned template text becomes the
    singular IPA and the second one the plural IPA.  Positional parameters
    3 and 4 are read as the singular and plural audio files.  Raw tags go
    to both sounds, homophones to the singular sound only.  Each sound
    with an IPA, an audio value or a homophone is tagged "singular" or
    "plural" and appended to ``word_entry.sounds``.
    """
    params = template_node.template_parameters
    sound_sg, sound_pl = Sound(), Sound()
    template_text = clean_node(wxr, None, template_node)
    bracketed_groups = re.findall(r"\[.+?\]", template_text)
    # zip() stops at the shorter input: with fewer than two matches the
    # remaining sound keeps its default IPA, and extra matches are ignored.
    for target, ipa in zip((sound_sg, sound_pl), bracketed_groups):
        target.ipa = ipa
    extract_audio_file(wxr, sound_sg, params, 3)
    extract_audio_file(wxr, sound_pl, params, 4)
    extract_tags(wxr, [sound_sg, sound_pl], params)
    extract_homophones(wxr, sound_sg, params)

    for number_tag, candidate in (
        ("singular", sound_sg),
        ("plural", sound_pl),
    ):
        has_content = (
            candidate.ipa != ""
            or candidate.audio != ""
            or len(candidate.homophones) > 0
        )
        if has_content:
            candidate.tags.append(number_tag)
            word_entry.sounds.append(candidate)
def process_transcription_la_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect one `Sound` from a "transcription-la" template.

    The cleaned template text is searched for the first occurrence of
    ``(<label>): [<ipa>]``.  When found, the bracketed part becomes the
    IPA and the stripped label is added as a raw tag; when not found,
    nothing is added to ``word_entry``.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-la
    template_text = clean_node(wxr, None, template_node)
    labelled_ipa = re.search(r"\((.+?)\): (\[.+?\])", template_text)
    if labelled_ipa is None:
        return

    label, ipa = labelled_ipa.groups()
    la_sound = Sound()
    la_sound.ipa = ipa
    la_sound.raw_tags.append(label.strip())
    word_entry.sounds.append(la_sound)
def process_transcription_grc_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: WikiNode
):
    """Collect `Sound`s from a "transcription-grc" template.

    The template is converted back to wikitext and re-parsed with
    ``expand_all=True``.  Among the top-level children of the result:

    * a ``<span class="IPA">`` contributes one sound whose IPA is the
      cleaned span text (skipped when that text is empty);
    * a list contributes one sound per ``<label>: [<ipa>]`` pair found in
      the cleaned text of each list item, with the stripped label stored
      as a raw tag.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-grc
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    for child in expanded_root.children:
        is_ipa_span = (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and child.attrs.get("class", "") == "IPA"
        )
        if is_ipa_span:
            span_ipa = clean_node(wxr, None, child)
            if span_ipa != "":
                word_entry.sounds.append(Sound(ipa=span_ipa))
            continue

        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            item_text = clean_node(wxr, None, item.children)
            for label, ipa in re.findall(r"(.+?): (\[.+?\])", item_text):
                word_entry.sounds.append(
                    Sound(ipa=ipa, raw_tags=[label.strip()])
                )
def extract_ipa(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: str | int,
):
    """Set ``sound.ipa`` from the template parameter stored under ``key``.

    The parameter is cleaned with `clean_node`; a non-empty result is
    wrapped in square brackets before being assigned.  A missing or empty
    parameter leaves ``sound`` untouched.
    """
    raw_value = template_params.get(key, "")
    transcription = clean_node(wxr, {}, raw_value)
    if transcription == "":
        return
    sound.ipa = "[" + transcription + "]"
def extract_audio_file(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: str | int,
):
    """Pass the audio file named by parameter ``key`` on to ``sound``.

    The parameter is cleaned with `clean_node`; a non-empty result is
    handed to `set_sound_file_url_fields` together with ``sound``.  A
    missing or empty parameter leaves ``sound`` untouched.
    """
    raw_value = template_params.get(key, "")
    file_name = clean_node(wxr, None, raw_value)
    if file_name == "":
        return
    set_sound_file_url_fields(wxr, file_name, sound)
def extract_tags(
    wxr: WiktextractContext,
    sounds: Sound | list[Sound],
    template_params: dict[str, WikiNode],
):
    """Replace the raw tags of one or several sounds with the "норма" value.

    The "норма" parameter is cleaned with `clean_node`.  When the result
    is non-empty, ``raw_tags`` of every given sound is overwritten with a
    new single-element list holding that text; otherwise nothing changes.
    """
    tag_text = clean_node(wxr, None, template_params.get("норма", ""))
    if tag_text == "":
        return
    targets = sounds if isinstance(sounds, list) else [sounds]
    for target in targets:
        # A fresh list per sound, so the sounds never share one list.
        target.raw_tags = [tag_text]
def extract_homophones(
    wxr: WiktextractContext,
    sounds: Sound | list[Sound],
    template_params: dict[str, WikiNode],
):
    """Set the homophones of one or several sounds from "омофоны".

    The "омофоны" parameter is cleaned with `clean_node` and split on
    commas; each piece is stripped and empty pieces are discarded.  When
    at least one homophone remains, ``homophones`` of every given sound is
    overwritten with it; otherwise nothing changes.
    """
    homophones_raw = clean_node(wxr, {}, template_params.get("омофоны", ""))
    homophones = [
        piece
        for piece in map(str.strip, homophones_raw.split(","))
        if piece != ""
    ]
    if not homophones:
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for target in targets:
        # Each sound gets its own copy: handing the same list object to
        # several sounds would make a later in-place change on one of
        # them show up on all the others.
        target.homophones = list(homophones)
# Maps a transcription template name to the function that handles it.
# A value of None marks a template name that is recognised but has no
# processor: the lookup in extract_pronunciation_section then does nothing
# for that template.
TRANSCRIPTION_TEMPLATE_PROCESSORS = {
    "transcription": process_transcription_template,
    "transcriptions": process_transcriptions_template,
    "transcription-ru": process_transcription_ru_template,
    "transcriptions-ru": process_transcriptions_ru_template,
    "transcription-la": process_transcription_la_template,
    "transcription-uk": None,  # recognised, no processor
    "transcription-grc": process_transcription_grc_template,
    "transcription eo": None,  # recognised, no processor
}
def extract_pronunciation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract sounds from the templates directly under ``level_node``.

    Templates listed in ``TRANSCRIPTION_TEMPLATE_PROCESSORS`` are passed
    to their processor (entries mapped to ``None`` are skipped).  For
    "audio", "аудио" and "медиа" templates the first positional parameter
    is cleaned and stripped; when non-empty it is handed to
    `set_sound_file_url_fields` for the last sound already collected, or
    for a newly created sound when ``word_entry.sounds`` is still empty.
    All other templates are ignored.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        if name in TRANSCRIPTION_TEMPLATE_PROCESSORS:
            handler = TRANSCRIPTION_TEMPLATE_PROCESSORS[name]
            if handler is not None:
                handler(wxr, word_entry, template)
            continue

        if name not in ("audio", "аудио", "медиа"):
            continue
        first_param = template.template_parameters.get(1, "")
        audio_file = clean_node(wxr, None, first_param).strip()
        if audio_file == "":
            continue

        if word_entry.sounds:
            # Attach the file to the most recently collected sound.
            set_sound_file_url_fields(wxr, audio_file, word_entry.sounds[-1])
        else:
            audio_sound = Sound()
            set_sound_file_url_fields(wxr, audio_file, audio_sound)
            word_entry.sounds.append(audio_sound)
def extract_homophone_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Store every link found under ``level_node`` as a homophone.

    Link nodes are searched recursively and cleaned with `clean_node`;
    empty results are dropped.  When at least one homophone remains, a
    single `Sound` holding all of them is appended to
    ``word_entry.sounds``.
    """
    linked_words = [
        text
        for link in level_node.find_child_recursively(NodeKind.LINK)
        if len(text := clean_node(wxr, None, link)) > 0
    ]
    if not linked_words:
        return
    word_entry.sounds.append(Sound(homophones=linked_words))
def extract_rhyme_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract rhymes from the list items directly under ``level_node``.

    Within one list item, a text node whose stripped form ends with ":"
    is split on commas into raw tags, and each link node becomes a `Sound`
    whose ``rhymes`` is the cleaned link text.  A sound is created with
    the raw tags collected so far in its list item, run through
    `translate_raw_tags`, and appended to ``word_entry.sounds``.
    """
    for rhyme_list in level_node.find_child(NodeKind.LIST):
        for item in rhyme_list.find_child(NodeKind.LIST_ITEM):
            # Tags accumulate across the whole list item.
            item_tags = []
            for child in item.children:
                if isinstance(child, str) and child.strip().endswith(":"):
                    labels = (
                        piece.strip() for piece in child.strip(": ").split(",")
                    )
                    item_tags.extend(label for label in labels if label != "")
                    continue

                if not (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.LINK
                ):
                    continue
                rhyme_text = clean_node(wxr, None, child)
                if rhyme_text == "":
                    continue
                rhyme_sound = Sound(rhymes=rhyme_text, raw_tags=item_tags)
                translate_raw_tags(rhyme_sound)
                word_entry.sounds.append(rhyme_sound)