Coverage for src/wiktextract/extractor/ru/pronunciation.py: 84%

148 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2from typing import Union 

3 

4from wikitextprocessor.parser import ( 

5 LevelNode, 

6 NodeKind, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from ..share import set_sound_file_url_fields 

13from .models import Sound, WordEntry 

14from .tags import translate_raw_tags 

15 

16 

def process_transcription_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect pronunciation data from a ``transcription`` template.

    Positional parameter 1 is handed to ``extract_ipa`` and positional
    parameter 2 to ``extract_audio_file``; the named parameters are handled
    by ``extract_tags`` and ``extract_homophones``.  The resulting ``Sound``
    is appended to ``word_entry.sounds`` only if at least one of its fields
    differs from the model defaults.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription
    params = template_node.template_parameters
    entry_sound = Sound()

    extract_ipa(wxr, entry_sound, params, 1)
    extract_audio_file(wxr, entry_sound, params, 2)
    extract_tags(wxr, entry_sound, params)
    extract_homophones(wxr, entry_sound, params)

    # An untouched Sound dumps to an empty dict: nothing worth recording.
    if entry_sound.model_dump(exclude_defaults=True) == {}:
        return
    word_entry.sounds.append(entry_sound)

36 

37 

def process_transcriptions_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect singular and plural pronunciations from ``transcriptions``.

    Positional parameters 1/2 are read as the singular/plural IPA and 3/4 as
    the singular/plural audio file.  The tag parameter is applied to both
    sounds, homophones only to the singular one.  A sound is appended to
    ``word_entry.sounds`` (tagged "singular" or "plural") only when it has
    an IPA or an audio value.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcriptions
    params = template_node.template_parameters
    singular, plural = Sound(), Sound()

    for target, ipa_key in ((singular, 1), (plural, 2)):
        extract_ipa(wxr, target, params, ipa_key)
    for target, audio_key in ((singular, 3), (plural, 4)):
        extract_audio_file(wxr, target, params, audio_key)

    extract_tags(wxr, [singular, plural], params)
    extract_homophones(wxr, singular, params)

    for target, number_tag in ((singular, "singular"), (plural, "plural")):
        has_data = target.model_dump(exclude_defaults=True) != {}
        # Tags/homophones alone are not enough; require a pronunciation.
        if has_data and (target.ipa or target.audio):
            target.tags.append(number_tag)
            word_entry.sounds.append(target)

69 

70 

def process_transcription_ru_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect pronunciation data from a ``transcription-ru`` template.

    The IPA is taken from the "вручную" parameter when it is non-empty;
    otherwise the whole template is cleaned to text and the first
    ``[...]`` span found in it is used.  Audio (positional parameter 2),
    homophones and tags are then extracted, and the ``Sound`` is appended to
    ``word_entry.sounds`` if any field differs from the model defaults.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-ru
    params = template_node.template_parameters
    entry_sound = Sound()

    transcription = clean_node(wxr, {}, params.get("вручную", ""))
    if not transcription:
        # No manual value: fall back to the bracketed text of the template.
        expanded = clean_node(wxr, {}, template_node)
        bracketed = re.search(r"\[(.*?)\]", expanded)
        if bracketed is not None:
            transcription = bracketed.group(1)

    if transcription:
        entry_sound.ipa = transcription

    extract_audio_file(wxr, entry_sound, params, 2)
    extract_homophones(wxr, entry_sound, params)
    extract_tags(wxr, entry_sound, params)

    if entry_sound.model_dump(exclude_defaults=True) == {}:
        return
    word_entry.sounds.append(entry_sound)

97 

98 

def process_transcriptions_ru_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect singular and plural pronunciations from ``transcriptions-ru``.

    The template is cleaned to text and every ``[...]`` span is collected:
    the first one becomes the singular IPA and the second one the plural
    IPA.  Positional parameters 3 and 4 are read as the singular and plural
    audio file.  The tag parameter is applied to both sounds, homophones
    only to the singular one.

    The singular sound is appended (tagged "singular") when any of its
    fields differs from the model defaults.  The plural sound is appended
    (tagged "plural") only when it carries an IPA or an audio value: the
    only other data it can receive is the raw tag shared with the singular
    sound, and a plural entry holding nothing but that copied tag describes
    no pronunciation at all.
    """
    sound_sg = Sound()
    sound_pl = Sound()

    template_params = template_node.template_parameters

    cleaned_node = clean_node(wxr, {}, template_node)
    ipa_matches = re.findall(r"\[(.*?)\]", cleaned_node)
    if len(ipa_matches) > 0:
        sound_sg.ipa = ipa_matches[0]
    if len(ipa_matches) > 1:
        sound_pl.ipa = ipa_matches[1]

    extract_audio_file(wxr, sound_sg, template_params, 3)
    extract_audio_file(wxr, sound_pl, template_params, 4)

    extract_tags(wxr, [sound_sg, sound_pl], template_params)

    extract_homophones(wxr, sound_sg, template_params)

    if sound_sg.model_dump(exclude_defaults=True) != {}:
        sound_sg.tags.append("singular")
        word_entry.sounds.append(sound_sg)

    # Same guard as process_transcriptions_template: without it a word with
    # a single transcription plus a norm label would produce a plural entry
    # that contains only the copied label.
    if sound_pl.model_dump(exclude_defaults=True) != {} and (
        sound_pl.ipa or sound_pl.audio
    ):
        sound_pl.tags.append("plural")
        word_entry.sounds.append(sound_pl)

128 

129 

def process_transcription_la_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect the pronunciation rendered by a ``transcription-la`` template.

    The template is cleaned to text and searched for the first
    ``(label): [ipa]`` occurrence.  When found, a ``Sound`` with that IPA
    and the stripped label as its only raw tag is appended to
    ``word_entry.sounds``; otherwise nothing is added.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-la
    expanded = clean_node(wxr, {}, template_node)
    labelled = re.search(r"\((.*?)\): \[(.*?)\]", expanded)
    if labelled is None:
        return

    label, transcription = labelled.groups()
    entry_sound = Sound()
    entry_sound.ipa = transcription
    entry_sound.raw_tags = [label.strip()]
    word_entry.sounds.append(entry_sound)

142 

143 

def process_transcription_grc_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect the pronunciations rendered by ``transcription-grc``.

    The template is cleaned to text and scanned for every
    ``* label: [ipa]`` occurrence.  Each one yields its own ``Sound`` whose
    IPA is the bracketed text and whose single raw tag is the stripped
    label; the sounds are appended to ``word_entry.sounds`` in the order
    they appear.  Nothing is added when no occurrence is found.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-grc
    cleaned_node = clean_node(wxr, {}, template_node)
    ipa_with_labels = re.findall(r"\* (.*?): \[(.*?)\]", cleaned_node)
    for label, ipa in ipa_with_labels:
        word_entry.sounds.append(Sound(ipa=ipa, raw_tags=[label.strip()]))

154 

155 

def extract_ipa(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: Union[str, int],
):
    """Store the cleaned text of parameter ``key`` as ``sound.ipa``.

    A missing parameter is treated as an empty string, and an empty cleaned
    result leaves ``sound`` untouched.
    """
    raw_value = template_params.get(key, "")
    transcription = clean_node(wxr, {}, raw_value)
    if not transcription:
        return
    sound.ipa = transcription

165 

166 

def extract_audio_file(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: Union[str, int],
):
    """Attach the audio file named by parameter ``key`` to ``sound``.

    A missing parameter is treated as an empty string.  When the cleaned
    file name is non-empty it is passed, together with ``sound``, to
    ``set_sound_file_url_fields``; otherwise ``sound`` is left untouched.
    """
    raw_value = template_params.get(key, "")
    file_name = clean_node(wxr, None, raw_value)
    if file_name == "":
        return
    set_sound_file_url_fields(wxr, file_name, sound)

176 

177 

def extract_tags(
    wxr: WiktextractContext,
    sounds: Union[Sound, list[Sound]],
    template_params: dict[str, WikiNode],
):
    """Copy the "норма" parameter into ``raw_tags`` of the given sound(s).

    ``sounds`` may be a single ``Sound`` or a list of them.  When the
    cleaned parameter text is non-empty, each sound's ``raw_tags`` is
    replaced by a fresh one-element list holding that text; otherwise
    nothing is modified.
    """
    norm_label = clean_node(wxr, None, template_params.get("норма", ""))
    if norm_label == "":
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for target in targets:
        # A new list per sound, so the sounds never share one list object.
        target.raw_tags = [norm_label]

190 

191 

def extract_homophones(
    wxr: WiktextractContext,
    sounds: Union[Sound, list[Sound]],
    template_params: dict[str, WikiNode],
):
    """Copy the "омофоны" parameter into ``homophones`` of the sound(s).

    The cleaned parameter text is split on commas; each piece is stripped
    and empty pieces are dropped.  When at least one homophone remains, the
    resulting list is assigned to ``homophones`` of the single ``Sound`` or
    of every ``Sound`` in the list; otherwise nothing is modified.
    """
    cleaned_text = clean_node(wxr, {}, template_params.get("омофоны", ""))
    stripped_parts = (part.strip() for part in cleaned_text.split(","))
    homophones = [part for part in stripped_parts if part != ""]
    if not homophones:
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for target in targets:
        target.homophones = homophones

207 

208 

# Maps a pronunciation template name to the function that extracts sounds
# from it.  A value of None marks a template name that
# extract_pronunciation_section recognises but does not process.
TRANSCRIPTION_TEMPLATE_PROCESSORS = {
    # Generic templates driven by explicit parameters.
    "transcription": process_transcription_template,
    "transcriptions": process_transcriptions_template,
    # Language-specific templates.
    "transcription-ru": process_transcription_ru_template,
    "transcriptions-ru": process_transcriptions_ru_template,
    "transcription-la": process_transcription_la_template,
    "transcription-uk": None,
    "transcription-grc": process_transcription_grc_template,
    "transcription eo": None,
}

219 

220 

def extract_pronunciation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract sounds from the template children of a pronunciation section.

    Templates listed in ``TRANSCRIPTION_TEMPLATE_PROCESSORS`` are dispatched
    to their processor (entries mapped to ``None`` are skipped).  For the
    "audio", "аудио" and "медиа" templates, positional parameter 1 is read
    as an audio file name: it is attached to the last sound already in
    ``word_entry.sounds`` or, when there is none, to a new ``Sound`` that is
    then appended.  All other templates are ignored.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name

        if name in TRANSCRIPTION_TEMPLATE_PROCESSORS:
            handler = TRANSCRIPTION_TEMPLATE_PROCESSORS[name]
            if handler is not None:
                handler(wxr, word_entry, template)
            continue

        if name not in ("audio", "аудио", "медиа"):
            continue

        file_param = template.template_parameters.get(1, "")
        audio_file = clean_node(wxr, None, file_param).strip()
        if audio_file == "":
            continue

        if word_entry.sounds:
            # Attach the file to the most recently added sound.
            set_sound_file_url_fields(wxr, audio_file, word_entry.sounds[-1])
            continue

        standalone = Sound()
        set_sound_file_url_fields(wxr, audio_file, standalone)
        word_entry.sounds.append(standalone)

243 

244 

def extract_homophone_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Record the links found in a homophone section as one ``Sound``.

    Every link node found recursively under ``level_node`` is cleaned to
    text; non-empty results are gathered, in order, into the ``homophones``
    list of a single new ``Sound`` appended to ``word_entry.sounds``.
    Nothing is appended when no non-empty link text is found.
    """
    link_texts = (
        clean_node(wxr, None, link)
        for link in level_node.find_child_recursively(NodeKind.LINK)
    )
    homophones = [text for text in link_texts if text]
    if homophones:
        word_entry.sounds.append(Sound(homophones=homophones))

256 

257 

def extract_rhyme_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract rhymes from the list items of a rhyme section.

    Within each list item the children are scanned in order.  A string
    child whose stripped text ends with ":" is treated as a comma-separated
    list of labels, which are accumulated as raw tags for that item.  Each
    link child with non-empty cleaned text produces a ``Sound`` carrying
    that text as ``rhymes`` plus the raw tags gathered so far; the sound is
    passed through ``translate_raw_tags`` and appended to
    ``word_entry.sounds``.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            raw_tags = []
            for child in item.children:
                if isinstance(child, str):
                    if child.strip().endswith(":"):
                        labels = (
                            piece.strip()
                            for piece in child.strip(": ").split(",")
                        )
                        raw_tags.extend(
                            label for label in labels if label != ""
                        )
                    continue

                if not isinstance(child, WikiNode):
                    continue
                if child.kind != NodeKind.LINK:
                    continue

                rhyme = clean_node(wxr, None, child)
                if rhyme == "":
                    continue
                rhyme_sound = Sound(rhymes=rhyme, raw_tags=raw_tags)
                translate_raw_tags(rhyme_sound)
                word_entry.sounds.append(rhyme_sound)