Coverage for src / wiktextract / extractor / fr / pronunciation.py: 84%

185 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-28 08:21 +0000

1import re 

2 

3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import set_sound_file_url_fields 

8from .models import Linkage, Sound, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_pronunciation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    base_data: WordEntry,
) -> None:
    """Collect sounds from a pronunciation section and attach them to entries.

    Sounds are gathered from every list item found directly under
    ``level_node`` and from "cmn-pron"/"zh-cmn-pron" templates.  Nothing is
    modified when no sound is found.

    Where the sounds (and the categories they carry) are added:

    * ``page_data`` is empty: ``base_data``;
    * ``level_node`` is a level 3 title node: every entry of ``page_data``
      whose ``lang_code`` equals ``base_data.lang_code``;
    * otherwise: only the last entry of ``page_data``.
    """
    lang_code = base_data.lang_code
    collected = []
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                collected += process_pron_list_item(
                    wxr, list_item, [], lang_code
                )
        elif isinstance(child, TemplateNode) and child.template_name in (
            "cmn-pron",
            "zh-cmn-pron",
        ):
            collected += process_cmn_pron_template(wxr, child)

    if not collected:
        return

    # Pick the entries that receive the sounds, then update them uniformly.
    if not page_data:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        targets = [
            entry for entry in page_data if entry.lang_code == lang_code
        ]
    else:
        targets = [page_data[-1]]

    for entry in targets:
        entry.sounds.extend(collected)
        for sound in collected:
            entry.categories.extend(sound.categories)

50 

51 

# Names of the templates handled by `process_pron_template`.
PRON_TEMPLATES = frozenset(
    {
        "prononciation",
        "pron",  # redirect to "prononciation"
        "//",  # redirect to "prononciation"
        "phon",  # redirect to "prononciation"
        "pron-recons",  # use "pron"
        "prononciation reconstruite",  # redirect to "pron-recons"
        "pron recons",  # redirect to "pron-recons"
        "phono",
        "pron-API",
        "API",
    }
)

66 

# Templates that may directly precede a pronunciation template; their
# expanded text is prepended to the IPA string by `process_pron_template`.
ASPIRATED_H_TEMPLATES = frozenset(
    {
        "h aspiré",
        "h",  # redirect to "h aspiré"
        "h muet",
    }
)

74 

75 

def process_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    parent_raw_tags: list[str],
    lang_code: str,
) -> list[Sound]:
    """Extract the sounds described by one pronunciation list item.

    ``parent_raw_tags`` is copied, never mutated.  Children are handled in
    order:

    * templates go through `process_sound_list_item_templates`, which may
      append to the working raw tag list;
    * bold nodes add their cleaned text as a raw tag;
    * each ``span`` found inside a link node becomes a sound stored in
      ``ipa``;
    * for a string containing ":", the non-empty text after the first colon
      becomes a sound, stored in ``zh_pron`` when ``lang_code`` is "zh" and
      in ``ipa`` otherwise.

    Nested list items are then processed with the accumulated raw tags.
    """
    raw_tags = list(parent_raw_tags)
    sounds = []
    text_field = "ipa" if lang_code != "zh" else "zh_pron"
    seen_colon = False
    children = list_item_node.children

    for position, child in enumerate(children):
        if isinstance(child, TemplateNode):
            # The slice holds the node just before the template, or nothing
            # for the first child (``children[-1:0]`` is empty).
            sounds += process_sound_list_item_templates(
                wxr,
                child,
                raw_tags,
                seen_colon,
                children[position - 1 : position],
                lang_code,
            )
            continue

        if isinstance(child, WikiNode):
            if child.kind == NodeKind.BOLD:
                raw_tags.append(clean_node(wxr, None, child))
            elif child.kind == NodeKind.LINK:
                for span_tag in child.find_html_recursively("span"):
                    link_sound = Sound(
                        ipa=clean_node(wxr, None, span_tag),
                        raw_tags=list(raw_tags),
                    )
                    translate_raw_tags(link_sound)
                    sounds.append(link_sound)
            continue

        if isinstance(child, str) and ":" in child:
            seen_colon = True
            pron_text = child.partition(":")[2].strip()
            if pron_text:
                text_sound = Sound(raw_tags=list(raw_tags))
                setattr(text_sound, text_field, pron_text)
                translate_raw_tags(text_sound)
                sounds.append(text_sound)

    # NOTE(review): if `find_child_recursively` also yields list items nested
    # more than one level deep, those items are visited again by the
    # recursive call below - confirm against wikitextprocessor.
    for nested_item in list_item_node.find_child_recursively(
        NodeKind.LIST_ITEM
    ):
        sounds += process_pron_list_item(
            wxr, nested_item, raw_tags, lang_code
        )

    return sounds

131 

132 

def process_sound_list_item_templates(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
    after_colon: bool,
    pre_nodes: list[WikiNode],
    lang_code: str,
) -> list[Sound]:
    """Dispatch one template found inside a pronunciation list item.

    Returns the sounds produced by the matching handler.  Templates listed
    in ``ASPIRATED_H_TEMPLATES`` produce nothing here.  Any other unknown
    template that appears before the colon is treated as a raw tag (e.g. a
    location): its cleaned text is appended to ``raw_tags`` in place, so the
    caller's list is modified.
    """
    name = t_node.template_name

    if name in PRON_TEMPLATES:
        return process_pron_template(
            wxr, t_node, raw_tags, lang_code, pre_nodes
        )
    if name == "lang":
        return extract_lang_template(wxr, t_node, raw_tags, lang_code)
    if name in ("écouter", "audio", "pron-rég"):
        return [process_ecouter_template(wxr, t_node, raw_tags)]
    if name == "pron-rimes":
        return [process_pron_rimes_template(wxr, t_node, raw_tags)]

    if name not in ASPIRATED_H_TEMPLATES and not after_colon:
        # location
        raw_tags.append(clean_node(wxr, None, t_node))

    return []

162 

163 

def process_pron_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
    lang_code: str,
    previous_nodes: list[WikiNode] | None = None,
) -> list[Sound]:
    """Extract IPA sounds from a pronunciation template.

    The template is expanded and every non-empty ``span`` with class "API"
    becomes a sound whose raw tags are a copy of ``raw_tags``.

    ``previous_nodes`` holds the nodes preceding the template.  When its
    last element is a template listed in ``ASPIRATED_H_TEMPLATES``, the
    cleaned text of that template is prepended to each IPA string.
    ``None`` (the default) means there is no preceding node.

    ``lang_code`` is accepted for signature compatibility and is not used.
    """
    if (
        t_node.template_name in PRON_TEMPLATES
        and clean_node(wxr, None, t_node.template_parameters.get(1, "")) == ""
    ):
        # some pages don't pass IPA parameter to the "pron" template
        # and expand to an edit link for adding the missing data.
        return []

    # https://en.wikipedia.org/wiki/Aspirated_h
    # https://fr.wiktionary.org/wiki/Modèle:h_aspiré
    aspirated_h = ""
    if previous_nodes:
        last_node = previous_nodes[-1]
        if (
            isinstance(last_node, TemplateNode)
            and last_node.template_name in ASPIRATED_H_TEMPLATES
        ):
            aspirated_h = clean_node(wxr, None, last_node)

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds_list = []
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="API"
    ):
        ipa = clean_node(wxr, None, span_tag)
        if ipa != "":
            sound = Sound(raw_tags=raw_tags[:], ipa=aspirated_h + ipa)
            translate_raw_tags(sound)
            sounds_list.append(sound)
    return sounds_list

198 

199 

def extract_lang_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
    lang_code: str,
) -> list[Sound]:
    """Turn the comma-separated text of a "lang" template into sounds.

    The cleaned template text is split on commas.  Each distinct, non-empty
    piece becomes one sound, in order of first appearance, stored in
    ``zh_pron`` when ``lang_code`` is "zh" and in ``ipa`` otherwise.  Every
    sound gets its own copy of ``raw_tags``.
    """
    sounds = []
    field = "zh_pron" if lang_code == "zh" else "ipa"
    pron_texts = clean_node(wxr, None, t_node)
    prons = set()
    # Split on the ASCII comma and on the fullwidth comma (U+FF0C); the
    # latter is written as an escape so it cannot be lost to re-encoding.
    for pron_text in re.split(r"[,\uff0c]", pron_texts):
        pron_text = pron_text.strip()
        if pron_text and pron_text not in prons:
            prons.add(pron_text)
            sound = Sound(raw_tags=raw_tags[:])
            setattr(sound, field, pron_text)
            translate_raw_tags(sound)
            sounds.append(sound)
    return sounds

219 

220 

def process_ecouter_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Build a sound from a sound file template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:écouter

    Reads the location from parameter 1 (surrounding parentheses removed),
    the pronunciation from parameter 2 or, failing that, "pron", and the
    file name from "audio".  The location is added after a copy of
    ``raw_tags``; the pronunciation is stored wrapped in square brackets.
    """
    sound = Sound()
    params = template_node.template_parameters

    location = clean_node(wxr, None, params.get(1, ""))
    if location.startswith("(") and location.endswith(")"):
        location = location.strip("()")
    ipa = clean_node(wxr, None, params.get(2, params.get("pron", "")))
    audio_file = clean_node(wxr, None, params.get("audio", ""))

    if raw_tags:
        sound.raw_tags = list(raw_tags)
    if location:
        sound.raw_tags.append(location)
    if ipa:
        sound.ipa = f"[{ipa}]"
    if audio_file:
        set_sound_file_url_fields(wxr, audio_file, sound)

    translate_raw_tags(sound)
    return sound

253 

254 

def is_ipa_text(text: str) -> bool:
    """Tell whether ``text`` looks like an IPA transcription.

    Used for inflection table cell text.  The text is accepted when it is
    enclosed in one of the known delimiter pairs: ``\\...\\``, ``*\\...\\``,
    ``/.../``, ``[...]`` or ``ou \\...\\``.

    The opening and closing delimiters must not overlap, so a lone "/" or
    a lone backslash is not mistaken for a transcription.
    """
    delimiter_pairs = (
        ("\\", "\\"),
        ("*\\", "\\"),
        ("/", "/"),
        ("[", "]"),
        # some inflection table template like "en-nom-rég" might have a
        # second ipa text in a new line
        ("ou ", "\\"),
    )
    return any(
        len(text) >= len(opening) + len(closing)
        and text.startswith(opening)
        and text.endswith(closing)
        for opening, closing in delimiter_pairs
    )

268 

269 

def process_pron_rimes_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Build a sound from a "pron-rimes" template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:pron-rimes

    The template is expanded; the text of the first ``span`` is stored as
    the IPA and the text of the second one as the rhyme.  Later spans are
    cleaned but not stored.
    """
    sound = Sound()
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)

    # Field filled by the span at the same position.
    span_fields = ("ipa", "rhymes")
    for position, span_tag in enumerate(
        expanded_node.find_html_recursively("span")
    ):
        span_text = clean_node(wxr, None, span_tag)
        if position < len(span_fields):
            setattr(sound, span_fields[position], span_text)

    if raw_tags:
        sound.raw_tags = list(raw_tags)
    translate_raw_tags(sound)
    # Passing `sound` as the second argument - presumably lets clean_node
    # record the categories of the expansion on it; confirm in clean_node.
    clean_node(wxr, sound, expanded_node)
    return sound

293 

294 

def process_cmn_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Extract sounds from a "cmn-pron" style template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:cmn-pron

    The template is expanded and every list item found directly under the
    expansion's lists is processed as a pronunciation list item with the
    language code "zh" and no inherited raw tags.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(
        wikitext,
        pre_expand=True,
        additional_expand={template_node.template_name},
    )

    sounds = []
    for pron_list in expanded_node.find_child(NodeKind.LIST):
        for pron_item in pron_list.find_child(NodeKind.LIST_ITEM):
            sounds += process_pron_list_item(wxr, pron_item, [], "zh")
    return sounds

310 

311 

def extract_homophone_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
    title_cats: list[str],
) -> None:
    """Extract homophones from a section and attach them to entries.

    Homophone sounds are read from the list items found directly under
    ``level_node``.  They are added, together with ``title_cats`` and the
    categories carried by the sounds, to every entry of ``page_data`` whose
    ``lang_code`` equals ``base_data.lang_code``, or to ``base_data`` itself
    when ``page_data`` is empty.
    """
    homophones = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            homophones += extract_homophone_list_item(wxr, list_item)

    if page_data:
        targets = [
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        ]
    else:
        targets = [base_data]

    for entry in targets:
        entry.sounds.extend(homophones)
        entry.categories.extend(title_cats)
        for sound in homophones:
            entry.categories.extend(sound.categories)

336 

337 

def extract_homophone_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Extract homophone sounds from one list item.

    Each link node with non-empty cleaned text yields a sound carrying that
    text as ``homophone``.  Each "l"/"lien" template is parsed with
    `process_lien_template`; when it yields a word, the sound also carries
    the romanization.
    """
    homophones = []
    for child in list_item.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            linked_word = clean_node(wxr, None, child)
            if linked_word != "":
                homophones.append(Sound(homophone=linked_word))
            continue

        if not (
            isinstance(child, TemplateNode)
            and child.template_name in ("l", "lien")
        ):
            continue

        # Imported only when needed - presumably to avoid a circular
        # import with the linkage module; confirm before moving it.
        from .linkage import process_lien_template

        linkage = Linkage(word="")
        process_lien_template(wxr, child, linkage)
        if linkage.word != "":
            homophones.append(
                Sound(homophone=linkage.word, roman=linkage.roman)
            )

    return homophones
357 

358 return sounds