Coverage for src/wiktextract/extractor/ru/pronunciation.py: 85%

150 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import re 

2 

3from wikitextprocessor import HTMLNode, LevelNode, NodeKind, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import set_sound_file_url_fields 

8from .models import Sound, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def process_transcription_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect one sound entry from a ``transcription`` template.

    Template page: https://ru.wiktionary.org/wiki/Шаблон:transcription

    Positional parameter 1 is handed to ``extract_ipa`` and positional
    parameter 2 to ``extract_audio_file``; tags and homophones are read
    from the template's named parameters by the shared helpers.  The
    sound is appended to ``word_entry.sounds`` only when it ended up with
    an IPA string, an audio value, or at least one homophone.
    """
    entry = Sound()
    params = template_node.template_parameters
    extract_ipa(wxr, entry, params, 1)
    extract_audio_file(wxr, entry, params, 2)
    extract_tags(wxr, entry, params)
    extract_homophones(wxr, entry, params)

    # A template that produced no IPA, audio or homophones is ignored.
    is_empty = (
        entry.ipa == "" and entry.audio == "" and len(entry.homophones) == 0
    )
    if is_empty:
        return
    word_entry.sounds.append(entry)

26 

27 

def process_transcriptions_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect singular and plural sound entries from ``transcriptions``.

    Template page: https://ru.wiktionary.org/wiki/Шаблон:transcriptions

    Positional parameters 1 and 2 are read as the singular and plural IPA,
    parameters 3 and 4 as the singular and plural audio files.  Tags are
    applied to both sounds, homophones only to the singular one.

    Each sound is appended to ``word_entry.sounds`` (tagged ``singular`` or
    ``plural``) only when it carries data.
    """
    sound_sg = Sound()
    sound_pl = Sound()
    template_params = template_node.template_parameters
    extract_ipa(wxr, sound_sg, template_params, 1)
    extract_ipa(wxr, sound_pl, template_params, 2)
    extract_audio_file(wxr, sound_sg, template_params, 3)
    extract_audio_file(wxr, sound_pl, template_params, 4)
    extract_tags(wxr, [sound_sg, sound_pl], template_params)
    extract_homophones(wxr, sound_sg, template_params)

    # Homophones are stored on the singular sound, so they must count as
    # data here; otherwise a template that only lists homophones would be
    # dropped.  This matches the check used by the other processors.
    if (
        sound_sg.ipa != ""
        or sound_sg.audio != ""
        or len(sound_sg.homophones) > 0
    ):
        sound_sg.tags.append("singular")
        word_entry.sounds.append(sound_sg)

    # The plural sound never receives homophones in this function, so only
    # IPA and audio are relevant for it.
    if sound_pl.ipa != "" or sound_pl.audio != "":
        sound_pl.tags.append("plural")
        word_entry.sounds.append(sound_pl)

50 

51 

def process_transcription_ru_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect one sound entry from a ``transcription-ru`` template.

    Template page: https://ru.wiktionary.org/wiki/Шаблон:transcription-ru

    The IPA is taken from the named parameter ``вручную`` when it is not
    empty.  Otherwise the whole template is passed through ``clean_node``
    and the first bracketed ``[...]`` span of the resulting text is used.
    Positional parameter 2 is read as the audio file.
    """
    entry = Sound()
    params = template_node.template_parameters

    manual_ipa = clean_node(wxr, None, params.get("вручную", ""))
    entry.ipa = manual_ipa
    if manual_ipa == "":
        # No manual transcription given: fall back to the template output.
        rendered = clean_node(wxr, None, template_node)
        bracketed = re.search(r"\[.+?\]", rendered)
        if bracketed is not None:
            entry.ipa = bracketed.group()

    extract_audio_file(wxr, entry, params, 2)
    extract_homophones(wxr, entry, params)
    extract_tags(wxr, entry, params)

    # A template that produced no IPA, audio or homophones is ignored.
    is_empty = (
        entry.ipa == "" and entry.audio == "" and len(entry.homophones) == 0
    )
    if is_empty:
        return
    word_entry.sounds.append(entry)

70 

71 

def process_transcriptions_ru_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect singular and plural sound entries from ``transcriptions-ru``.

    The template is passed through ``clean_node`` and the bracketed
    ``[...]`` spans of the resulting text are taken, in order, as the
    singular and the plural IPA.  Positional parameters 3 and 4 are read as
    the singular and plural audio files.  Tags go to both sounds,
    homophones only to the singular one.
    """
    singular = Sound()
    plural = Sound()
    params = template_node.template_parameters

    rendered = clean_node(wxr, None, template_node)
    bracketed = re.findall(r"\[.+?\]", rendered)
    # zip stops at the shorter input, so a missing second match simply
    # leaves the plural IPA untouched; extra matches are ignored.
    for target, ipa in zip((singular, plural), bracketed):
        target.ipa = ipa

    extract_audio_file(wxr, singular, params, 3)
    extract_audio_file(wxr, plural, params, 4)
    extract_tags(wxr, [singular, plural], params)
    extract_homophones(wxr, singular, params)

    for target, number_tag in ((singular, "singular"), (plural, "plural")):
        is_empty = (
            target.ipa == ""
            and target.audio == ""
            and len(target.homophones) == 0
        )
        if is_empty:
            continue
        target.tags.append(number_tag)
        word_entry.sounds.append(target)

103 

104 

def process_transcription_la_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: WikiNode
):
    """Collect one sound entry from a ``transcription-la`` template.

    Template page: https://ru.wiktionary.org/wiki/Шаблон:transcription-la

    The template is passed through ``clean_node`` and the first occurrence
    of ``(<label>): [<ipa>]`` in the resulting text is used: the bracketed
    part becomes the IPA and the parenthesised label becomes a raw tag.
    Nothing is added when the text contains no such occurrence.
    """
    rendered = clean_node(wxr, None, template_node)
    found = re.search(r"\((.+?)\): (\[.+?\])", rendered)
    if found is None:
        return

    label, ipa = found.group(1), found.group(2)
    entry = Sound()
    entry.ipa = ipa
    entry.raw_tags.append(label.strip())
    word_entry.sounds.append(entry)

116 

117 

def process_transcription_grc_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: WikiNode
):
    """Collect sound entries from a ``transcription-grc`` template.

    Template page: https://ru.wiktionary.org/wiki/Шаблон:transcription-grc

    The template is converted back to wikitext and re-parsed with
    ``expand_all=True``.  Two kinds of top-level nodes of the result are
    used:

    * ``<span class="IPA">`` elements: their cleaned text becomes the IPA
      of a new sound (skipped when empty);
    * lists: every ``<label>: [<ipa>]`` pair found in a list item's text
      becomes a new sound with the label as raw tag.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    for child in expanded.children:
        is_ipa_span = (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and child.attrs.get("class", "") == "IPA"
        )
        if is_ipa_span:
            ipa_text = clean_node(wxr, None, child)
            if ipa_text != "":
                word_entry.sounds.append(Sound(ipa=ipa_text))
            continue

        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            item_text = clean_node(wxr, None, item.children)
            pairs = re.findall(r"(.+?): (\[.+?\])", item_text)
            for label, ipa_text in pairs:
                tagged = Sound(ipa=ipa_text, raw_tags=[label.strip()])
                word_entry.sounds.append(tagged)

141 

142 

def extract_ipa(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: str | int,
):
    """Store the template parameter ``key`` on ``sound`` as bracketed IPA.

    The parameter value is cleaned with ``clean_node``; when the result is
    not empty it is wrapped in square brackets and assigned to
    ``sound.ipa``.  A missing or empty parameter leaves ``sound``
    unchanged.
    """
    raw_value = template_params.get(key, "")
    cleaned = clean_node(wxr, {}, raw_value)
    if cleaned == "":
        return
    sound.ipa = f"[{cleaned}]"

152 

153 

def extract_audio_file(
    wxr: WiktextractContext,
    sound: Sound,
    template_params: dict[str, WikiNode],
    key: str | int,
):
    """Attach the audio file named by template parameter ``key`` to ``sound``.

    The parameter value is cleaned with ``clean_node``; when the result is
    not empty it is passed on to ``set_sound_file_url_fields`` together
    with ``sound``.  A missing or empty parameter leaves ``sound``
    unchanged.
    """
    raw_value = template_params.get(key, "")
    file_name = clean_node(wxr, None, raw_value)
    if file_name == "":
        return
    set_sound_file_url_fields(wxr, file_name, sound)

163 

164 

def extract_tags(
    wxr: WiktextractContext,
    sounds: Sound | list[Sound],
    template_params: dict[str, WikiNode],
):
    """Copy the ``норма`` template parameter into the sounds' raw tags.

    ``sounds`` may be a single sound or a list of sounds.  When the cleaned
    parameter value is not empty, ``raw_tags`` of every given sound is
    replaced by a one-element list holding that value; otherwise nothing
    is changed.
    """
    tag_text = clean_node(wxr, None, template_params.get("норма", ""))
    if tag_text == "":
        return

    targets = sounds if isinstance(sounds, list) else [sounds]
    for target in targets:
        # A new list per sound, so the sounds never share one tag list.
        target.raw_tags = [tag_text]

177 

178 

def extract_homophones(
    wxr: WiktextractContext,
    sounds: Sound | list[Sound],
    template_params: dict[str, WikiNode],
):
    """Copy the ``омофоны`` template parameter into the sounds' homophones.

    The cleaned parameter value is split on commas; surrounding whitespace
    is removed from every piece and empty pieces are dropped.  When at
    least one homophone remains it is assigned to ``homophones`` of the
    given sound, or of every sound when ``sounds`` is a list.  Otherwise
    nothing is changed.
    """
    homophones_raw = clean_node(wxr, {}, template_params.get("омофоны", ""))
    homophones = [
        piece.strip()
        for piece in homophones_raw.split(",")
        if piece.strip() != ""
    ]
    if not homophones:
        return

    if isinstance(sounds, list):
        for sound in sounds:
            # Give every sound its own copy: assigning the same list object
            # to all of them would let a later change to one sound's
            # homophones show up on the others.
            sound.homophones = list(homophones)
    else:
        sounds.homophones = homophones

194 

195 

# Maps a transcription template name to the function that extracts sound
# data from it.  A value of None marks a template name that is recognised
# as a transcription template but has no processor: the pronunciation
# section extractor skips such templates.
TRANSCRIPTION_TEMPLATE_PROCESSORS = {
    # Generic templates, data is read from the template parameters.
    "transcription": process_transcription_template,
    "transcriptions": process_transcriptions_template,
    # Language-specific templates.
    "transcription-ru": process_transcription_ru_template,
    "transcriptions-ru": process_transcriptions_ru_template,
    "transcription-la": process_transcription_la_template,
    "transcription-uk": None,
    "transcription-grc": process_transcription_grc_template,
    "transcription eo": None,
}

206 

207 

def extract_pronunciation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract sound data from the templates of a pronunciation section.

    Only direct template children of ``level_node`` are looked at:

    * templates listed in ``TRANSCRIPTION_TEMPLATE_PROCESSORS`` are handed
      to their processor (entries mapped to ``None`` are skipped);
    * ``audio`` / ``аудио`` / ``медиа`` templates contribute the audio file
      named by their first positional parameter.  It is attached to the
      last sound already collected for ``word_entry``, or to a new sound
      when there is none yet.
    """
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name

        if name in TRANSCRIPTION_TEMPLATE_PROCESSORS:
            handler = TRANSCRIPTION_TEMPLATE_PROCESSORS.get(name)
            if handler is not None:
                handler(wxr, word_entry, t_node)
            continue

        if name not in ["audio", "аудио", "медиа"]:
            continue
        first_param = t_node.template_parameters.get(1, "")
        file_name = clean_node(wxr, None, first_param).strip()
        if file_name == "":
            continue

        # NOTE(review): when the last sound already has audio this call
        # receives that same sound again - confirm this is intended.
        if len(word_entry.sounds) > 0:
            set_sound_file_url_fields(wxr, file_name, word_entry.sounds[-1])
            continue
        audio_only = Sound()
        set_sound_file_url_fields(wxr, file_name, audio_only)
        word_entry.sounds.append(audio_only)

230 

231 

def extract_homophone_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect the links of a homophone section into one sound entry.

    Every link found anywhere below ``level_node`` is cleaned with
    ``clean_node``; the non-empty results, in document order, become the
    ``homophones`` of a single new sound.  No sound is added when there is
    no non-empty link text.
    """
    found = [
        text
        for link in level_node.find_child_recursively(NodeKind.LINK)
        if (text := clean_node(wxr, None, link))
    ]
    if not found:
        return
    word_entry.sounds.append(Sound(homophones=found))

243 

244 

def extract_rhyme_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect rhymes from the list items of a rhyme section.

    The direct children of every list item are scanned in order:

    * a plain string whose stripped form ends with ``:`` is treated as a
      comma-separated list of labels, which are added to the raw tags
      collected for this list item;
    * a link contributes one sound whose ``rhymes`` is the cleaned link
      text (skipped when empty) and whose raw tags are the labels
      collected so far; ``translate_raw_tags`` is then applied to it.
    """
    for rhyme_list in level_node.find_child(NodeKind.LIST):
        for item in rhyme_list.find_child(NodeKind.LIST_ITEM):
            labels = []
            for child in item.children:
                if isinstance(child, str) and child.strip().endswith(":"):
                    # Drop the trailing colon and surrounding blanks, then
                    # keep every non-empty comma-separated label.
                    pieces = child.strip(": ").split(",")
                    for piece in pieces:
                        label = piece.strip()
                        if label != "":
                            labels.append(label)
                    continue

                is_link = (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.LINK
                )
                if not is_link:
                    continue
                rhyme_text = clean_node(wxr, None, child)
                if rhyme_text == "":
                    continue
                rhyme_sound = Sound(rhymes=rhyme_text, raw_tags=labels)
                translate_raw_tags(rhyme_sound)
                word_entry.sounds.append(rhyme_sound)