Coverage for src/wiktextract/extractor/fr/pronunciation.py: 86%

140 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2 

3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import set_sound_file_url_fields 

8from .models import Sound, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_pronunciation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    base_data: WordEntry,
) -> None:
    """Collect sounds from a pronunciation section and attach them to entries.

    Sounds are gathered from the list items found directly under
    ``level_node`` and from any "cmn-pron" / "zh-cmn-pron" template found
    there. Nothing is modified when no sound is found.

    Where the sounds (and their categories) end up:

    * ``page_data`` is empty: on ``base_data``;
    * ``level_node`` is a level 3 title: on every entry of ``page_data``
      whose ``lang_code`` equals ``base_data.lang_code``;
    * otherwise: on the last entry of ``page_data`` only.
    """
    lang_code = base_data.lang_code
    collected = []
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                collected += process_pron_list_item(wxr, item, [], lang_code)
        elif isinstance(child, TemplateNode) and child.template_name in [
            "cmn-pron",
            "zh-cmn-pron",
        ]:
            collected += process_cmn_pron_template(wxr, child)

    if not collected:
        return

    # Decide which entries receive the extracted data.
    if not page_data:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        # A level 3 pronunciation title applies to every entry of the
        # same language.
        targets = [entry for entry in page_data if entry.lang_code == lang_code]
    else:
        targets = [page_data[-1]]

    for entry in targets:
        entry.sounds.extend(collected)
        for sound in collected:
            entry.categories.extend(sound.categories)

50 

51 

# Template names handled by `process_pron_template`.
PRON_TEMPLATES = frozenset(
    {
        # "prononciation" and the names that redirect to it
        "prononciation",
        "pron",
        "//",
        "phon",
        # "pron-recons" (uses "pron") and the names that redirect to it
        "pron-recons",
        "prononciation reconstruite",
        "pron recons",
        # used in template "cmn-pron", which expands to a list of Pinyin
        "lang",
    }
)

64 

# Templates marking how an initial "h" behaves; their expanded text is
# prefixed to the pronunciation that follows them.
ASPIRATED_H_TEMPLATES = frozenset(
    {
        "h aspiré",
        "h",  # redirect to "h aspiré"
        "h muet",
    }
)

72 

73 

def process_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    parent_raw_tags: list[str],
    lang_code: str,
) -> list[Sound]:
    """Extract `Sound` objects from one pronunciation list item.

    Children of the list item are visited in order:

    * templates are delegated to `process_sound_list_item_templates`,
      together with the node that immediately precedes them;
    * bold nodes add their cleaned text to the raw tags;
    * link nodes yield one sound per ``<span>`` found inside them;
    * plain text containing ":" yields a sound from the text after the
      first colon, stored in ``zh_pron`` when ``lang_code`` is "zh" and in
      ``ipa`` otherwise.

    Nested list items are then processed with the raw tags accumulated here.
    ``parent_raw_tags`` itself is never modified.
    """
    raw_tags = list(parent_raw_tags)
    found = []
    text_field = "ipa" if lang_code != "zh" else "zh_pron"
    seen_colon = False
    children = list_item_node.children
    for position, child in enumerate(children):
        if isinstance(child, TemplateNode):
            # The slice is empty for the first child (position 0).
            found += process_sound_list_item_templates(
                wxr,
                child,
                raw_tags,
                seen_colon,
                children[position - 1 : position],
            )
        elif isinstance(child, WikiNode):
            if child.kind == NodeKind.BOLD:
                raw_tags.append(clean_node(wxr, None, child))
            elif child.kind == NodeKind.LINK:
                for span in child.find_html_recursively("span"):
                    link_sound = Sound(
                        ipa=clean_node(wxr, None, span),
                        raw_tags=raw_tags[:],
                    )
                    translate_raw_tags(link_sound)
                    found.append(link_sound)
        elif isinstance(child, str):
            _, colon, tail = child.partition(":")
            if colon:
                seen_colon = True
                tail = tail.strip()
                if tail:
                    text_sound = Sound(raw_tags=raw_tags[:])
                    setattr(text_sound, text_field, tail)
                    translate_raw_tags(text_sound)
                    found.append(text_sound)

    # NOTE(review): this searches recursively AND the callee recurses again;
    # if `find_child_recursively` yields items at every depth, items nested
    # two or more levels down may be processed more than once - confirm.
    for nested_item in list_item_node.find_child_recursively(
        NodeKind.LIST_ITEM
    ):
        found += process_pron_list_item(wxr, nested_item, raw_tags, lang_code)

    return found

128 

129 

def process_sound_list_item_templates(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
    after_colon: bool,
    pre_nodes: list[WikiNode],
) -> list[Sound]:
    """Dispatch a template found in a pronunciation list item.

    Returns the sounds produced by the matching handler. Templates that
    match no handler produce no sound; when such a template is not one of
    `ASPIRATED_H_TEMPLATES` and appears before the colon, its cleaned text
    is appended to ``raw_tags`` IN PLACE (the original code labels this
    case "location").
    """
    name = template_node.template_name
    if name in PRON_TEMPLATES:
        return process_pron_template(wxr, template_node, raw_tags, pre_nodes)
    if name in {"écouter", "audio", "pron-rég"}:
        return [process_ecouter_template(wxr, template_node, raw_tags)]
    if name == "pron-rimes":
        return [process_pron_rimes_template(wxr, template_node, raw_tags)]

    # Aspirated-h templates are skipped here: `process_pron_template` reads
    # them through the nodes preceding a pronunciation template.
    if name not in ASPIRATED_H_TEMPLATES and not after_colon:
        raw_tags.append(clean_node(wxr, None, template_node))
    return []

154 

155 

def process_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
    # Quoted so the union is not evaluated at definition time.
    previous_nodes: "list[WikiNode] | None" = None,
) -> list[Sound]:
    """Build `Sound` entries from a pronunciation template.

    The expanded template text is split on commas (ASCII "," and full-width
    U+FF0C) and each distinct, non-empty piece becomes one `Sound`. The text
    is stored in ``zh_pron`` for the "lang" template and in ``ipa``
    otherwise.

    Args:
        wxr: Extraction context passed through to `clean_node`.
        template_node: The pronunciation template to process.
        raw_tags: Raw tags copied onto every returned sound; not modified.
        previous_nodes: Nodes preceding the template. When the last one is
            a template listed in `ASPIRATED_H_TEMPLATES`, its cleaned text
            is prefixed to every pronunciation. ``None`` (the default) and
            an empty list both mean "no preceding node".

    Returns:
        The extracted sounds; empty when the template has an empty first
        parameter or expands to empty text.
    """
    first_arg = template_node.template_parameters.get(1, "")
    if (
        template_node.template_name in PRON_TEMPLATES
        and isinstance(first_arg, str)
        and len(first_arg) == 0
    ):
        # some pages don't pass the IPA parameter to the "pron" template,
        # which then expands to an edit link for adding the missing data.
        return []

    pron_texts = clean_node(wxr, None, template_node)

    # https://en.wikipedia.org/wiki/Aspirated_h
    # https://fr.wiktionary.org/wiki/Modèle:h_aspiré
    aspirated_h = ""
    last_node = previous_nodes[-1] if previous_nodes else None
    if (
        isinstance(last_node, TemplateNode)
        and last_node.template_name in ASPIRATED_H_TEMPLATES
    ):
        aspirated_h = clean_node(wxr, None, last_node)

    sounds_list = []
    if len(pron_texts) > 0:
        use_key = "zh_pron" if template_node.template_name == "lang" else "ipa"
        seen = set()
        # The full-width comma is written as an escape so that it cannot be
        # silently normalised into a second ASCII comma.
        for pron_text in re.split("[,\uff0c]", pron_texts):
            pron_text = pron_text.strip()
            if len(pron_text) > 0 and pron_text not in seen:
                seen.add(pron_text)
                sound = Sound(raw_tags=raw_tags[:])
                setattr(sound, use_key, aspirated_h + pron_text)
                translate_raw_tags(sound)
                sounds_list.append(sound)
    return sounds_list

191 

192 

def process_ecouter_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Create a `Sound` from a sound file template.

    See https://fr.wiktionary.org/wiki/Modèle:écouter

    Template parameters read:

    * ``1``: location text, appended to the sound's raw tags when not
      empty (surrounding parentheses are removed when it is wrapped in
      them);
    * ``2``, falling back to ``pron``: IPA text;
    * ``audio``: audio file name, passed to `set_sound_file_url_fields`.

    ``raw_tags`` is copied onto the sound and is not modified.
    """
    params = template_node.template_parameters
    sound = Sound()

    location = clean_node(wxr, None, params.get(1, ""))
    if location.startswith("(") and location.endswith(")"):
        location = location.strip("()")
    ipa = clean_node(wxr, None, params.get(2, params.get("pron", "")))
    audio_file = clean_node(wxr, None, params.get("audio", ""))

    if raw_tags:
        sound.raw_tags = list(raw_tags)
    if location:
        sound.raw_tags.append(location)
    if ipa:
        sound.ipa = ipa
    if audio_file:
        set_sound_file_url_fields(wxr, audio_file, sound)
    translate_raw_tags(sound)
    return sound

225 

226 

def is_ipa_text(text: str) -> bool:
    """Return whether ``text`` looks like IPA in an inflection table cell.

    IPA text ends with a backslash and starts either with a backslash or
    with "ou ": some inflection table templates, like "en-nom-rég", put a
    second IPA text on a new line introduced by "ou ".
    """
    return text.endswith("\\") and text.startswith(("\\", "ou "))

236 

237 

def process_pron_rimes_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Create a `Sound` from a "pron-rimes" template.

    See https://fr.wiktionary.org/wiki/Modèle:pron-rimes

    The template is expanded and its ``<span>`` tags are read in order:
    the first gives the IPA text and the second the rhymes. ``raw_tags``
    is copied onto the sound and is not modified.
    """
    span_fields = ("ipa", "rhymes")
    sound = Sound()
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for position, span in enumerate(expanded.find_html_recursively("span")):
        text = clean_node(wxr, None, span)
        if position < len(span_fields):
            setattr(sound, span_fields[position], text)
    if raw_tags:
        sound.raw_tags = list(raw_tags)
    translate_raw_tags(sound)
    # Clean the whole expansion with `sound` as the target object
    # (presumably to collect categories onto it - confirm in `clean_node`).
    clean_node(wxr, sound, expanded)
    return sound

261 

262 

def process_cmn_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Extract sounds from a "cmn-pron" template.

    See https://fr.wiktionary.org/wiki/Modèle:cmn-pron

    The template's wikitext is parsed again with the template itself
    expanded, and every item of the lists found directly under the parsed
    root is processed as a Chinese ("zh") pronunciation list item.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node),
        pre_expand=True,
        additional_expand={template_node.template_name},
    )
    return [
        sound
        for list_node in expanded.find_child(NodeKind.LIST)
        for list_item in list_node.find_child(NodeKind.LIST_ITEM)
        for sound in process_pron_list_item(wxr, list_item, [], "zh")
    ]