Coverage for src/wiktextract/extractor/fr/pronunciation.py: 84%

185 statements  

coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

  1  import re
  2
  3  from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
  4
  5  from ...page import clean_node
  6  from ...wxr_context import WiktextractContext
  7  from ..share import set_sound_file_url_fields
  8  from .models import Linkage, Sound, WordEntry
  9  from .tags import translate_raw_tags
 10
 11
 12  def extract_pronunciation(
 13      wxr: WiktextractContext,
 14      page_data: list[WordEntry],
 15      level_node: WikiNode,
 16      base_data: WordEntry,
 17  ) -> None:
 18      sounds_list = []
 19      lang_code = base_data.lang_code
 20      for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
 21          if node.kind == NodeKind.LIST:
 22              for list_item_node in node.find_child(NodeKind.LIST_ITEM):
 23                  sounds_list.extend(
 24                      process_pron_list_item(wxr, list_item_node, [], lang_code)
 25                  )
 26          elif isinstance(node, TemplateNode):    26 ↛ 20  line 26 didn't jump to line 20 because the condition on line 26 was always true
 27              if node.template_name in ["cmn-pron", "zh-cmn-pron"]:
 28                  sounds_list.extend(process_cmn_pron_template(wxr, node))
 29
 30      if len(sounds_list) == 0:
 31          return
 32
 33      if level_node.kind == NodeKind.LEVEL3 and len(page_data) > 0:    33 ↛ 42  line 33 didn't jump to line 42 because the condition on line 33 was always true
 34          # Add extracted sound data to all sense dictionaries that have the same
 35          # language code when the pronunciation subtitle is a level 3 title node.
 36          # Otherwise only add to the last one.
 37          for sense_data in page_data:
 38              if sense_data.lang_code == lang_code:
 39                  sense_data.sounds.extend(sounds_list)
 40                  for sound in sounds_list:
 41                      sense_data.categories.extend(sound.categories)
 42      elif len(page_data) > 0:
 43          page_data[-1].sounds.extend(sounds_list)
 44          for sound in sounds_list:
 45              page_data[-1].categories.extend(sound.categories)
 46      else:
 47          base_data.sounds.extend(sounds_list)
 48          for sound in sounds_list:
 49              base_data.categories.extend(sound.categories)
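For orientation, a sketch of the kind of wikitext extract_pronunciation receives; the heading and template calls below are invented, illustrative examples rather than content from a specific entry.

# Illustrative pronunciation section (assumed example, not from a real page):
#
#   === {{S|prononciation}} ===
#   * {{pron|bɔ̃.ʒuʁ|fr}}
#   * {{écouter|France (Paris)|bɔ̃.ʒuʁ|audio=Fr-bonjour.ogg|lang=fr}}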

 50
 51
 52  PRON_TEMPLATES = frozenset(
 53      [
 54          "pron",  # redirect to "prononciation"
 55          "prononciation",
 56          "//",  # redirect to "prononciation"
 57          "phon",  # redirect to "prononciation"
 58          "pron-recons",  # use "pron"
 59          "prononciation reconstruite",  # redirect to "pron-recons"
 60          "pron recons",  # redirect to "pron-recons"
 61          "phono",
 62      ]
 63  )
 64
 65  ASPIRATED_H_TEMPLATES = frozenset(
 66      [
 67          "h aspiré",
 68          "h",  # redirect to "h aspiré"
 69          "h muet",
 70      ]
 71  )
 72
 73

 74  def process_pron_list_item(
 75      wxr: WiktextractContext,
 76      list_item_node: WikiNode,
 77      parent_raw_tags: list[str],
 78      lang_code: str,
 79  ) -> list[Sound]:
 80      current_raw_tags = parent_raw_tags[:]
 81      sounds_list = []
 82      pron_key = "zh_pron" if lang_code == "zh" else "ipa"
 83      after_colon = False
 84      for child_index, list_item_child in enumerate(list_item_node.children):
 85          if isinstance(list_item_child, TemplateNode):
 86              sounds_list.extend(
 87                  process_sound_list_item_templates(
 88                      wxr,
 89                      list_item_child,
 90                      current_raw_tags,
 91                      after_colon,
 92                      list_item_node.children[child_index - 1 : child_index],
 93                      lang_code,
 94                  )
 95              )
 96          elif isinstance(list_item_child, WikiNode):
 97              if list_item_child.kind == NodeKind.BOLD:
 98                  current_raw_tags.append(clean_node(wxr, None, list_item_child))
 99              elif list_item_child.kind == NodeKind.LINK:
100                  for span_tag in list_item_child.find_html_recursively("span"):
101                      sound = Sound(
102                          ipa=clean_node(wxr, None, span_tag),
103                          raw_tags=current_raw_tags[:],
104                      )
105                      translate_raw_tags(sound)
106                      sounds_list.append(sound)
107          elif isinstance(list_item_child, str):    107 ↛ 84  line 107 didn't jump to line 84 because the condition on line 107 was always true
108              if ":" in list_item_child:
109                  after_colon = True
110                  pron_text = list_item_child[
111                      list_item_child.find(":") + 1 :
112                  ].strip()
113                  if len(pron_text) > 0:
114                      sound = Sound(raw_tags=current_raw_tags[:])
115                      setattr(sound, pron_key, pron_text)
116                      translate_raw_tags(sound)
117                      sounds_list.append(sound)
118
119      for nest_list_item in list_item_node.find_child_recursively(
120          NodeKind.LIST_ITEM
121      ):
122          sounds_list.extend(
123              process_pron_list_item(
124                  wxr, nest_list_item, current_raw_tags, lang_code
125              )
126          )
127
128      return sounds_list

129
130
131  def process_sound_list_item_templates(
132      wxr: WiktextractContext,
133      t_node: TemplateNode,
134      raw_tags: list[str],
135      after_colon: bool,
136      pre_nodes: list[WikiNode],
137      lang_code: str,
138  ) -> list[Sound]:
139      if t_node.template_name in PRON_TEMPLATES:
140          return process_pron_template(
141              wxr, t_node, raw_tags, lang_code, pre_nodes
142          )
143      elif t_node.template_name == "lang":
144          return extract_lang_template(wxr, t_node, raw_tags, lang_code)
145      elif t_node.template_name in {
146          "écouter",
147          "audio",
148          "pron-rég",
149      }:
150          return [process_ecouter_template(wxr, t_node, raw_tags)]
151      elif t_node.template_name == "pron-rimes":
152          return [process_pron_rimes_template(wxr, t_node, raw_tags)]
153      elif t_node.template_name in ASPIRATED_H_TEMPLATES:    153 ↛ 154  line 153 didn't jump to line 154 because the condition on line 153 was never true
154          pass
155      elif not after_colon:  # location    155 ↛ 159  line 155 didn't jump to line 159 because the condition on line 155 was always true
156          raw_tag = clean_node(wxr, None, t_node)
157          raw_tags.append(raw_tag)
158
159      return []

160
161
162  def process_pron_template(
163      wxr: WiktextractContext,
164      t_node: TemplateNode,
165      raw_tags: list[str],
166      lang_code: str,
167      previous_nodes: list[WikiNode] = [],
168  ) -> list[Sound]:
169      if (
170          t_node.template_name in PRON_TEMPLATES
171          and clean_node(wxr, None, t_node.template_parameters.get(1, "")) == ""
172      ):
173          # some pages don't pass the IPA parameter to the "pron" template,
174          # which then expands to an edit link for adding the missing data.
175          return []
176      sounds_list = []
177      # https://en.wikipedia.org/wiki/Aspirated_h
178      # https://fr.wiktionary.org/wiki/Modèle:h_aspiré
179      aspirated_h = ""
180      if len(previous_nodes) > 0 and isinstance(previous_nodes[-1], TemplateNode):
181          if previous_nodes[-1].template_name in ASPIRATED_H_TEMPLATES:    181 ↛ 184  line 181 didn't jump to line 184 because the condition on line 181 was always true
182              aspirated_h = clean_node(wxr, None, previous_nodes[-1])
183
184      expanded_node = wxr.wtp.parse(
185          wxr.wtp.node_to_wikitext(t_node), expand_all=True
186      )
187      for span_tag in expanded_node.find_html_recursively(
188          "span", attr_name="class", attr_value="API"
189      ):
190          ipa = clean_node(wxr, None, span_tag)
191          if ipa != "":    191 ↛ 187  line 191 didn't jump to line 187 because the condition on line 191 was always true
192              sound = Sound(raw_tags=raw_tags[:], ipa=aspirated_h + ipa)
193              translate_raw_tags(sound)
194              sounds_list.append(sound)
195      return sounds_list

196
197
198  def extract_lang_template(
199      wxr: WiktextractContext,
200      t_node: TemplateNode,
201      raw_tags: list[str],
202      lang_code: str,
203  ) -> list[Sound]:
204      sounds = []
205      field = "zh_pron" if lang_code == "zh" else "ipa"
206      pron_texts = clean_node(wxr, None, t_node)
207      prons = set()
208      for pron_text in re.split(",|，", pron_texts):
209          pron_text = pron_text.strip()
210          if len(pron_text) > 0 and pron_text not in prons:
211              prons.add(pron_text)
212              sound = Sound(raw_tags=raw_tags[:])
213              setattr(sound, field, pron_text)
214              translate_raw_tags(sound)
215              sounds.append(sound)
216      return sounds
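A hedged illustration of the split above, with invented values: the pattern separates pronunciations on both the ASCII comma and the fullwidth comma used in Chinese text, and the prons set drops repeated values.

# Illustration (invented input): if clean_node() returns "a, b，b", the loop
# yields Sound objects for "a" and "b" only -- "，" is also treated as a
# separator and the duplicate "b" is skipped via the prons set.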

217
218
219  def process_ecouter_template(
220      wxr: WiktextractContext,
221      template_node: TemplateNode,
222      raw_tags: list[str],
223  ) -> Sound:
224      # sound file template: https://fr.wiktionary.org/wiki/Modèle:écouter
225      sound = Sound()
226      location = clean_node(
227          wxr, None, template_node.template_parameters.get(1, "")
228      )
229      if location.startswith("(") and location.endswith(")"):    229 ↛ 230  line 229 didn't jump to line 230 because the condition on line 229 was never true
230          location = location.strip("()")
231      ipa = clean_node(
232          wxr,
233          None,
234          template_node.template_parameters.get(
235              2, template_node.template_parameters.get("pron", "")
236          ),
237      )
238      audio_file = clean_node(
239          wxr, None, template_node.template_parameters.get("audio", "")
240      )
241      if len(raw_tags) > 0:    241 ↛ 242  line 241 didn't jump to line 242 because the condition on line 241 was never true
242          sound.raw_tags = raw_tags[:]
243      if len(location) > 0:    243 ↛ 245  line 243 didn't jump to line 245 because the condition on line 243 was always true
244          sound.raw_tags.append(location)
245      if len(ipa) > 0:
246          sound.ipa = ipa
247      if len(audio_file) > 0:    247 ↛ 249  line 247 didn't jump to line 249 because the condition on line 247 was always true
248          set_sound_file_url_fields(wxr, audio_file, sound)
249      translate_raw_tags(sound)
250      return sound
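For reference, a sketch of how the parameters of a typical {{écouter}} call map onto Sound fields according to the code above; the concrete values are assumed for illustration.

# {{écouter|France (Paris)|bɔ̃.ʒuʁ|audio=Fr-bonjour.ogg|lang=fr}}  (illustrative)
#   parameter 1, "France (Paris)"  -> appended to sound.raw_tags as a location
#   parameter 2 or "pron"          -> sound.ipa
#   parameter "audio"              -> file name passed to set_sound_file_url_fields(),
#                                     which fills the audio file/URL fields on Sound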

251
252
253  def is_ipa_text(text: str) -> bool:
254      # check if the text is IPA, used for inflection table cell text
255      if text.startswith("\\") and text.endswith("\\"):
256          return True
257      if text.startswith("/") and text.endswith("/"):    257 ↛ 258  line 257 didn't jump to line 258 because the condition on line 257 was never true
258          return True
259      if text.startswith("[") and text.endswith("]"):    259 ↛ 260  line 259 didn't jump to line 260 because the condition on line 259 was never true
260          return True
261      if text.startswith("ou ") and text.endswith("\\"):
262          # some inflection table templates like "en-nom-rég" might have a second
263          # IPA text on a new line
264          return True
265      return False
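A few illustrative inputs and the values the predicate above returns; the strings are invented and written as Python literals, so "\\" denotes a single backslash character (the usual delimiter for transcriptions in fr.wiktionary inflection tables).

#   is_ipa_text("\\bɔ̃.ʒuʁ\\")     -> True
#   is_ipa_text("/bɔ̃.ʒuʁ/")       -> True
#   is_ipa_text("[bɔ̃.ʒuʁ]")       -> True
#   is_ipa_text("ou \\bɔ̃.ʒuʁ\\")  -> True   (second transcription introduced by "ou")
#   is_ipa_text("bonjour")         -> False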

266
267
268  def process_pron_rimes_template(
269      wxr: WiktextractContext,
270      template_node: TemplateNode,
271      raw_tags: list[str],
272  ) -> Sound:
273      # https://fr.wiktionary.org/wiki/Modèle:pron-rimes
274      sound = Sound()
275      expanded_node = wxr.wtp.parse(
276          wxr.wtp.node_to_wikitext(template_node), expand_all=True
277      )
278      for index, span_tag in enumerate(
279          expanded_node.find_html_recursively("span")
280      ):
281          span_text = clean_node(wxr, None, span_tag)
282          if index == 0:
283              sound.ipa = span_text
284          elif index == 1:    284 ↛ 278  line 284 didn't jump to line 278 because the condition on line 284 was always true
285              sound.rhymes = span_text
286      if len(raw_tags) > 0:    286 ↛ 287  line 286 didn't jump to line 287 because the condition on line 286 was never true
287          sound.raw_tags = raw_tags[:]
288      translate_raw_tags(sound)
289      clean_node(wxr, sound, expanded_node)
290      return sound
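A sketch of what the loop above expects from the expanded {{pron-rimes}} template: the first <span> is read as the IPA, the second as the rhyme, and the final clean_node() call collects any category links into sound.categories. The example call and values are assumptions, not taken from the source.

#   {{pron-rimes|bɔ̃.ʒuʁ|uʁ|fr}}   (hypothetical call)
#   first <span>   -> sound.ipa     (e.g. "bɔ̃.ʒuʁ")
#   second <span>  -> sound.rhymes  (e.g. "uʁ")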

291
292
293  def process_cmn_pron_template(
294      wxr: WiktextractContext, template_node: TemplateNode
295  ) -> list[Sound]:
296      # https://fr.wiktionary.org/wiki/Modèle:cmn-pron
297      sounds_list = []
298      expanded_node = wxr.wtp.parse(
299          wxr.wtp.node_to_wikitext(template_node),
300          pre_expand=True,
301          additional_expand={template_node.template_name},
302      )
303      for list_node in expanded_node.find_child(NodeKind.LIST):
304          for list_item in list_node.find_child(NodeKind.LIST_ITEM):
305              sounds_list.extend(process_pron_list_item(wxr, list_item, [], "zh"))
306
307      return sounds_list
308
309
310  def extract_homophone_section(
311      wxr: WiktextractContext,
312      page_data: list[WordEntry],
313      base_data: WordEntry,
314      level_node: WikiNode,
315      title_cats: list[str],
316  ) -> None:
317      sounds = []
318      for list_node in level_node.find_child(NodeKind.LIST):
319          for list_item in list_node.find_child(NodeKind.LIST_ITEM):
320              sounds.extend(extract_homophone_list_item(wxr, list_item))
321
322      if len(page_data) > 0:    322 ↛ 330  line 322 didn't jump to line 330 because the condition on line 322 was always true
323          for data in page_data:
324              if data.lang_code == base_data.lang_code:    324 ↛ 323  line 324 didn't jump to line 323 because the condition on line 324 was always true
325                  data.sounds.extend(sounds)
326                  data.categories.extend(title_cats)
327                  for sound in sounds:
328                      data.categories.extend(sound.categories)
329      else:
330          base_data.sounds.extend(sounds)
331          base_data.categories.extend(title_cats)
332          for sound in sounds:
333              base_data.categories.extend(sound.categories)
334
335
336  def extract_homophone_list_item(
337      wxr: WiktextractContext, list_item: WikiNode
338  ) -> list[Sound]:
339      sounds = []
340      for node in list_item.children:
341          if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:    341 ↛ 342  line 341 didn't jump to line 342 because the condition on line 341 was never true
342              word = clean_node(wxr, None, node)
343              if word != "":
344                  sounds.append(Sound(homophone=word))
345          elif isinstance(node, TemplateNode) and node.template_name in [
346              "l",
347              "lien",
348          ]:
349              from .linkage import process_lien_template
350
351              l_data = Linkage(word="")
352              process_lien_template(wxr, node, l_data)
353              if l_data.word != "":    353 ↛ 340  line 353 didn't jump to line 340 because the condition on line 353 was always true
354                  sounds.append(Sound(homophone=l_data.word, roman=l_data.roman))
355
356      return sounds
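Finally, an illustrative sketch of homophone list items and the Sound objects the two functions above would produce for them; the words are invented examples.

#   "* [[ver]], [[verre]]"  -> Sound(homophone="ver"), Sound(homophone="verre")
#   "* {{lien|vert|fr}}"    -> Sound(homophone="vert")  (word extracted via process_lien_template)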