Coverage for src/wiktextract/extractor/ru/pronunciation.py: 83%

1import re

2from typing import Union

4from wikitextprocessor.parser import (

5 LevelNode,

6 NodeKind,

7 WikiNode,

10from ...page import clean_node

11from ...wxr_context import WiktextractContext

12from ..share import set_sound_file_url_fields

13from .models import Sound, WordEntry

def process_transcription_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Collect a single pronunciation from a ``transcription`` template.

    Positional parameter 1 is read as the IPA value and positional
    parameter 2 as the audio file; the "норма" and "омофоны" named
    parameters are handled by ``extract_tags`` and ``extract_homophones``.
    The resulting ``Sound`` is appended to ``word_entry.sounds`` only when
    at least one of its fields differs from the model defaults.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription
    result = Sound()
    params = template_node.template_parameters

    extract_ipa(wxr, result, params, 1)
    extract_audio_file(wxr, result, params, 2)
    extract_tags(wxr, result, params)
    extract_homophones(wxr, result, params)

    # An all-default Sound serialises to an empty dict and is not recorded.
    if result.model_dump(exclude_defaults=True):
        word_entry.sounds.append(result)

def process_transcriptions_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Collect singular and plural pronunciations from ``transcriptions``.

    Positional parameters 1 and 2 are read as the singular and plural IPA,
    parameters 3 and 4 as the singular and plural audio files.  The "норма"
    label is applied to both sounds, homophones only to the singular one.
    A sound is appended to ``word_entry.sounds`` (tagged "singular" or
    "plural") only when it is non-default and has an IPA or audio value.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcriptions
    singular, plural = Sound(), Sound()
    params = template_node.template_parameters

    extract_ipa(wxr, singular, params, 1)
    extract_ipa(wxr, plural, params, 2)

    extract_audio_file(wxr, singular, params, 3)
    extract_audio_file(wxr, plural, params, 4)

    extract_tags(wxr, [singular, plural], params)

    extract_homophones(wxr, singular, params)

    # Singular is always considered (and appended) before plural.
    for number_tag, candidate in (("singular", singular), ("plural", plural)):
        if candidate.model_dump(exclude_defaults=True) == {}:
            continue
        if not (candidate.ipa or candidate.audio):
            # Only a label/homophones, no actual pronunciation: skip it.
            continue
        candidate.tags.append(number_tag)
        word_entry.sounds.append(candidate)

def process_transcription_ru_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Collect a pronunciation from a ``transcription-ru`` template.

    The IPA is taken from the "вручную" parameter when it is non-empty;
    otherwise the whole template is cleaned to text and the first
    ``[...]`` group found in that text is used.  Audio (positional
    parameter 2), homophones and the "норма" label are then extracted, and
    the sound is appended to ``word_entry.sounds`` if any field is set.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-ru
    entry_sound = Sound()
    params = template_node.template_parameters

    ipa_text = clean_node(wxr, {}, params.get("вручную", ""))
    if not ipa_text:
        # Fall back to the bracketed transcription in the cleaned template.
        expanded_text = clean_node(wxr, {}, template_node)
        bracketed = re.search(r"\[(.*?)\]", expanded_text)
        if bracketed:
            ipa_text = bracketed[1]

    if ipa_text:
        entry_sound.ipa = ipa_text

    extract_audio_file(wxr, entry_sound, params, 2)
    extract_homophones(wxr, entry_sound, params)
    extract_tags(wxr, entry_sound, params)

    if entry_sound.model_dump(exclude_defaults=True):
        word_entry.sounds.append(entry_sound)

102

103

def process_transcriptions_ru_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Collect singular and plural pronunciations from ``transcriptions-ru``.

    The template is cleaned to text and every ``[...]`` group is collected;
    the first group becomes the singular IPA and the second the plural IPA.
    Positional parameters 3 and 4 are read as the singular and plural audio
    files.  The "норма" label is applied to both sounds, homophones only to
    the singular one.

    A sound is appended to ``word_entry.sounds`` (tagged "singular" or
    "plural") only when it carries IPA, audio or homophones.  Because the
    "норма" label is written to both sounds, checking merely for "any
    non-default field" would emit a label-only entry for a number form that
    has no pronunciation at all; ``process_transcriptions_template`` guards
    against the same situation.
    """
    sound_sg = Sound()
    sound_pl = Sound()

    template_params = template_node.template_parameters

    cleaned_node = clean_node(wxr, {}, template_node)
    ipa_matches = re.findall(r"\[(.*?)\]", cleaned_node)
    # zip stops at the shorter side: match 0 -> singular, match 1 -> plural,
    # any further matches are ignored.
    for sound, ipa in zip((sound_sg, sound_pl), ipa_matches):
        sound.ipa = ipa

    extract_audio_file(wxr, sound_sg, template_params, 3)
    extract_audio_file(wxr, sound_pl, template_params, 4)

    extract_tags(wxr, [sound_sg, sound_pl], template_params)

    extract_homophones(wxr, sound_sg, template_params)

    for number_tag, sound in (("singular", sound_sg), ("plural", sound_pl)):
        # Require real pronunciation data, not just the shared label.
        # NOTE(review): assumes the defaults of ipa/audio/homophones are
        # falsy (empty string / empty list) — confirm against models.Sound.
        if sound.ipa or sound.audio or sound.homophones:
            sound.tags.append(number_tag)
            word_entry.sounds.append(sound)

135

136

def process_transcription_la_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect a pronunciation from a ``transcription-la`` template.

    The template is cleaned to text and searched for the first occurrence
    of ``(<label>): [<ipa>]``.  When found, a ``Sound`` with that IPA and
    the stripped label as its only raw tag is appended to
    ``word_entry.sounds``; otherwise nothing is recorded.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-la
    entry_sound = Sound()
    expanded_text = clean_node(wxr, {}, template_node)
    found = re.search(r"\((.*?)\): \[(.*?)\]", expanded_text)
    if not found:
        return

    label, ipa_text = found.group(1), found.group(2)
    entry_sound.ipa = ipa_text
    entry_sound.raw_tags = [label.strip()]
    word_entry.sounds.append(entry_sound)

149

150

def process_transcription_grc_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect labelled pronunciations from a ``transcription-grc`` template.

    The template is cleaned to text and every ``* <label>: [<ipa>]`` item in
    that text yields one ``Sound`` (IPA plus the stripped label as its only
    raw tag) appended to ``word_entry.sounds``.  Nothing is appended when no
    such item is found.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-grc
    cleaned_node = clean_node(wxr, {}, template_node)
    # Each match is a (label, ipa) pair; a fresh Sound is built per pair, so
    # no placeholder Sound needs to be created up front.
    for label, ipa in re.findall(r"\* (.*?): \[(.*?)\]", cleaned_node):
        word_entry.sounds.append(Sound(ipa=ipa, raw_tags=[label.strip()]))

161

162

def extract_ipa(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: Union[str, int],
):
    """Set ``sound.ipa`` from the template parameter ``key``.

    The parameter value (or an empty string when the parameter is absent)
    is cleaned to text; ``sound`` is left untouched if the result is empty.
    """
    raw_value = template_params.get(key, "")
    cleaned = clean_node(wxr, {}, raw_value)
    if not cleaned:
        return
    sound.ipa = cleaned

172

173

def extract_audio_file(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: Union[str, int],
):
    """Attach the audio file named by template parameter ``key`` to ``sound``.

    The parameter value (or an empty string when the parameter is absent)
    is cleaned to text; if it is non-empty it is handed to
    ``set_sound_file_url_fields`` together with ``sound``.
    """
    raw_value = template_params.get(key, "")
    file_name = clean_node(wxr, None, raw_value)
    if file_name == "":
        return
    set_sound_file_url_fields(wxr, file_name, sound)

183

184

def extract_tags(
    wxr: WiktextractContext,
    sounds: Union[Sound, list[Sound]],
    template_params: dict[str, WikiNode],
):
    """Store the "норма" template parameter as the raw tag of each sound.

    ``sounds`` may be a single ``Sound`` or a list of them.  When the
    cleaned parameter value is non-empty, ``raw_tags`` of every given sound
    is replaced by a new one-element list holding that value; otherwise
    nothing is changed.
    """
    label = clean_node(wxr, None, template_params.get("норма", ""))
    if label == "":
        return

    # Normalise the single-Sound form so both cases share one loop.
    targets = sounds if isinstance(sounds, list) else [sounds]
    for target in targets:
        target.raw_tags = [label]

197

198

def extract_homophones(
    wxr: WiktextractContext,
    sounds: Union[Sound, list[Sound]],
    template_params: dict[str, WikiNode],
):
    """Store the "омофоны" template parameter as homophones of each sound.

    The cleaned parameter value is split on commas; each part is stripped
    and empty parts are dropped.  ``sounds`` may be a single ``Sound`` or a
    list of them.  When at least one homophone remains, ``homophones`` of
    every given sound is replaced by it; otherwise nothing is changed.
    """
    homophones_raw = clean_node(wxr, {}, template_params.get("омофоны", ""))
    stripped_parts = (part.strip() for part in homophones_raw.split(","))
    homophones = [part for part in stripped_parts if part != ""]
    if not homophones:
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for sound in targets:
        # Give every sound its own list so that later mutation of one
        # sound's homophones cannot leak into another sound.
        sound.homophones = list(homophones)

214

215

# Maps a pronunciation template name to the function that extracts sounds
# from it.  A value of None marks a template name that is listed here but
# has no processor: extract_pronunciation_section skips such templates.
TRANSCRIPTION_TEMPLATE_PROCESSORS = {
    # Generic templates.
    "transcription": process_transcription_template,
    "transcriptions": process_transcriptions_template,
    # Language-specific templates.
    "transcription-ru": process_transcription_ru_template,
    "transcriptions-ru": process_transcriptions_ru_template,
    "transcription-la": process_transcription_la_template,
    "transcription-uk": None,
    "transcription-grc": process_transcription_grc_template,
    "transcription eo": None,
}

226

227

def extract_pronunciation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract sounds from the template children of a pronunciation section.

    Templates named in ``TRANSCRIPTION_TEMPLATE_PROCESSORS`` are passed to
    their processor (entries mapped to ``None`` are skipped).  Templates
    named "audio", "аудио" or "медиа" contribute an audio file taken from
    positional parameter 1: it is attached to the most recently added sound
    of ``word_entry`` or, if there is none yet, to a newly appended
    ``Sound``.  All other templates are ignored.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name

        if name in TRANSCRIPTION_TEMPLATE_PROCESSORS:
            handler = TRANSCRIPTION_TEMPLATE_PROCESSORS[name]
            if handler is not None:
                handler(wxr, word_entry, template)
            continue

        if name not in ("audio", "аудио", "медиа"):
            continue

        raw_value = template.template_parameters.get(1, "")
        file_name = clean_node(wxr, None, raw_value).strip()
        if file_name == "":
            continue

        if word_entry.sounds:
            # Attach the file to the last sound collected so far.
            set_sound_file_url_fields(wxr, file_name, word_entry.sounds[-1])
        else:
            standalone = Sound()
            set_sound_file_url_fields(wxr, file_name, standalone)
            word_entry.sounds.append(standalone)

252

253

def extract_homophone_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect homophones from every link found under ``level_node``.

    Each link node (searched recursively) is cleaned to text and empty
    results are dropped.  If anything remains, one ``Sound`` holding all
    homophones, in document order, is appended to ``word_entry.sounds``.
    """
    link_texts = (
        clean_node(wxr, None, link)
        for link in level_node.find_child_recursively(NodeKind.LINK)
    )
    collected = [text for text in link_texts if text]
    if collected:
        word_entry.sounds.append(Sound(homophones=collected))

266 word_entry.sounds.append(sound)