Coverage for src/wiktextract/extractor/ru/pronunciation.py: 83%

131 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from typing import Union 

3 

4from wikitextprocessor.parser import ( 

5 LevelNode, 

6 NodeKind, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from ..share import set_sound_file_url_fields 

13from .models import Sound, WordEntry 

14 

15 

def process_transcription_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Extract one sound from a ``transcription`` template.

    Positional parameter 1 is read as the IPA and parameter 2 as the audio
    file; the ``норма`` and ``омофоны`` named parameters are handled by
    ``extract_tags`` and ``extract_homophones``.  The sound is appended to
    ``word_entry.sounds`` only if at least one of its fields differs from
    the model defaults.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription
    params = template_node.template_parameters
    sound = Sound()

    extract_ipa(wxr, sound, params, 1)
    extract_audio_file(wxr, sound, params, 2)
    extract_tags(wxr, sound, params)
    extract_homophones(wxr, sound, params)

    # An empty dump means nothing was extracted, so there is nothing to keep.
    if sound.model_dump(exclude_defaults=True):
        word_entry.sounds.append(sound)

37 

38 

def process_transcriptions_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Extract singular and plural sounds from a ``transcriptions`` template.

    Positional parameters 1 and 2 are read as the singular and plural IPA,
    parameters 3 and 4 as the singular and plural audio files.  The ``норма``
    label is applied to both sounds, homophones only to the singular one.
    A sound is appended to ``word_entry.sounds`` (tagged "singular" or
    "plural") only when it carries an IPA or an audio value.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcriptions
    params = template_node.template_parameters
    singular, plural = Sound(), Sound()

    extract_ipa(wxr, singular, params, 1)
    extract_ipa(wxr, plural, params, 2)
    extract_audio_file(wxr, singular, params, 3)
    extract_audio_file(wxr, plural, params, 4)
    extract_tags(wxr, [singular, plural], params)
    extract_homophones(wxr, singular, params)

    for sound, number_tag in ((singular, "singular"), (plural, "plural")):
        is_non_default = sound.model_dump(exclude_defaults=True) != {}
        # A label or homophone list alone is not enough to emit a sound.
        if is_non_default and (sound.ipa or sound.audio):
            sound.tags.append(number_tag)
            word_entry.sounds.append(sound)

72 

73 

def process_transcription_ru_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Extract one sound from a ``transcription-ru`` template.

    The IPA is taken from the ``вручную`` parameter when it is non-empty;
    otherwise the whole template is cleaned to text and the first
    ``[...]`` group found in it is used.  Audio comes from positional
    parameter 2.  The sound is appended to ``word_entry.sounds`` only if at
    least one field differs from the model defaults.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-ru
    params = template_node.template_parameters

    ipa = clean_node(wxr, {}, params.get("вручную", ""))
    if not ipa:
        # No manual value: fall back to the text produced by the template.
        expanded = clean_node(wxr, {}, template_node)
        bracketed = re.search(r"\[(.*?)\]", expanded)
        ipa = bracketed.group(1) if bracketed else ipa

    sound = Sound()
    if ipa:
        sound.ipa = ipa

    extract_audio_file(wxr, sound, params, 2)
    extract_homophones(wxr, sound, params)
    extract_tags(wxr, sound, params)

    if sound.model_dump(exclude_defaults=True):
        word_entry.sounds.append(sound)

102 

103 

def process_transcriptions_ru_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: WikiNode,
):
    """Extract singular and plural sounds from a ``transcriptions-ru`` template.

    The template is cleaned to text and every ``[...]`` group is collected:
    the first becomes the singular IPA, the second the plural IPA.  Audio
    files are read from positional parameters 3 (singular) and 4 (plural).
    Each sound that has at least one non-default field is tagged
    "singular" / "plural" and appended to ``word_entry.sounds``.
    """
    params = template_node.template_parameters
    singular, plural = Sound(), Sound()

    expanded = clean_node(wxr, {}, template_node)
    bracketed = re.findall(r"\[(.*?)\]", expanded)
    # zip stops at the shorter input, so missing matches leave the IPA unset
    # and any matches beyond the second are ignored.
    for sound, ipa in zip((singular, plural), bracketed):
        sound.ipa = ipa

    extract_audio_file(wxr, singular, params, 3)
    extract_audio_file(wxr, plural, params, 4)
    extract_tags(wxr, [singular, plural], params)
    extract_homophones(wxr, singular, params)

    for sound, number_tag in ((singular, "singular"), (plural, "plural")):
        if sound.model_dump(exclude_defaults=True):
            sound.tags.append(number_tag)
            word_entry.sounds.append(sound)

135 

136 

def process_transcription_la_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Extract one sound from a ``transcription-la`` template.

    The template is cleaned to text and searched for the first
    ``(label): [ipa]`` occurrence.  When found, a sound with that IPA and
    the stripped label as its only raw tag is appended to
    ``word_entry.sounds``; otherwise nothing is added.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-la
    expanded = clean_node(wxr, {}, template_node)
    found = re.search(r"\((.*?)\): \[(.*?)\]", expanded)
    if not found:
        return

    label, ipa = found.groups()
    sound = Sound()
    sound.ipa = ipa
    sound.raw_tags = [label.strip()]
    word_entry.sounds.append(sound)

149 

150 

def process_transcription_grc_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Extract every labelled IPA from a ``transcription-grc`` template.

    The template is cleaned to text and each ``* label: [ipa]`` occurrence
    yields one sound whose IPA is the bracketed text and whose only raw tag
    is the stripped label.  All sounds are appended to
    ``word_entry.sounds`` in the order they occur in the text.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:transcription-grc
    expanded = clean_node(wxr, {}, template_node)
    # findall with two groups returns (label, ipa) tuples.
    word_entry.sounds.extend(
        Sound(ipa=ipa, raw_tags=[label.strip()])
        for label, ipa in re.findall(r"\* (.*?): \[(.*?)\]", expanded)
    )

161 

162 

def extract_ipa(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: Union[str, int],
):
    """Set ``sound.ipa`` from the template parameter ``key``.

    The parameter value is cleaned to text first; a missing or empty
    parameter leaves ``sound`` untouched.
    """
    value = clean_node(wxr, {}, template_params.get(key, ""))
    if not value:
        return
    sound.ipa = value

172 

173 

def extract_audio_file(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: Union[str, int],
):
    """Fill the audio fields of ``sound`` from the template parameter ``key``.

    The parameter value is cleaned to text and, when non-empty, handed to
    ``set_sound_file_url_fields`` together with ``sound``.  A missing or
    empty parameter leaves ``sound`` untouched.
    """
    file_name = clean_node(wxr, None, template_params.get(key, ""))
    if file_name == "":
        return
    set_sound_file_url_fields(wxr, file_name, sound)

183 

184 

def extract_tags(
    wxr: WiktextractContext,
    sounds: Union[Sound, list[Sound]],
    template_params: dict[str, WikiNode],
):
    """Store the ``норма`` template parameter as the raw tag of each sound.

    ``sounds`` may be a single sound or a list of sounds.  When the cleaned
    parameter text is non-empty, ``raw_tags`` of every given sound is
    replaced by a one-element list holding that text; otherwise nothing is
    changed.
    """
    label = clean_node(wxr, None, template_params.get("норма", ""))
    if label == "":
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for target in targets:
        target.raw_tags = [label]

197 

198 

def extract_homophones(
    wxr: WiktextractContext,
    sounds: Union[Sound, list[Sound]],
    template_params: dict[str, WikiNode],
):
    """Store the ``омофоны`` template parameter as homophones of each sound.

    The cleaned parameter text is split on commas; entries are stripped and
    empty ones dropped.  ``sounds`` may be a single sound or a list of
    sounds.  When at least one homophone remains, ``homophones`` of every
    given sound is replaced by that list; otherwise nothing is changed.
    """
    raw = clean_node(wxr, {}, template_params.get("омофоны", ""))
    stripped = (part.strip() for part in raw.split(","))
    homophones = [part for part in stripped if part]
    if not homophones:
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for target in targets:
        # Each sound gets its own copy so a later in-place change to one
        # sound's homophones cannot leak into the others.
        target.homophones = list(homophones)

214 

215 

# Maps a pronunciation template name to the function that extracts sounds
# from it.  A ``None`` value marks a template name that is recognised but
# has no extractor; extract_pronunciation_section skips such templates.
TRANSCRIPTION_TEMPLATE_PROCESSORS = {
    # Generic templates: IPA and audio come from positional parameters.
    "transcription": process_transcription_template,
    "transcriptions": process_transcriptions_template,
    # Language-specific templates: IPA is parsed from the cleaned text.
    "transcription-ru": process_transcription_ru_template,
    "transcriptions-ru": process_transcriptions_ru_template,
    "transcription-la": process_transcription_la_template,
    "transcription-uk": None,
    "transcription-grc": process_transcription_grc_template,
    "transcription eo": None,
}

226 

227 

def extract_pronunciation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect sounds from the template children of a pronunciation section.

    Templates listed in ``TRANSCRIPTION_TEMPLATE_PROCESSORS`` are passed to
    their processor (entries mapped to ``None`` are skipped).  For the
    ``audio`` / ``аудио`` / ``медиа`` templates, positional parameter 1 is
    used as an audio file: it is set on the most recently added sound of
    ``word_entry``, or on a new sound when there is none yet.  Any other
    template is ignored.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name

        if name in TRANSCRIPTION_TEMPLATE_PROCESSORS:
            handler = TRANSCRIPTION_TEMPLATE_PROCESSORS[name]
            if handler is not None:
                handler(wxr, word_entry, template)
            continue

        if name not in ("audio", "аудио", "медиа"):
            continue

        raw_file = clean_node(wxr, None, template.template_parameters.get(1, ""))
        file_name = raw_file.strip()
        if not file_name:
            continue

        if word_entry.sounds:
            # Attach the file to the sound extracted just before it.
            set_sound_file_url_fields(wxr, file_name, word_entry.sounds[-1])
        else:
            audio_only = Sound()
            set_sound_file_url_fields(wxr, file_name, audio_only)
            word_entry.sounds.append(audio_only)

252 

253 

def extract_homophone_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect homophones from the links found anywhere in a section.

    Every link node under ``level_node`` is cleaned to text; non-empty
    results are gathered in document order.  When at least one is found, a
    single sound holding all of them as ``homophones`` is appended to
    ``word_entry.sounds``.
    """
    link_texts = (
        clean_node(wxr, None, link)
        for link in level_node.find_child_recursively(NodeKind.LINK)
    )
    found = [text for text in link_texts if text]
    if found:
        word_entry.sounds.append(Sound(homophones=found))

266 word_entry.sounds.append(sound)