Coverage for src/wiktextract/extractor/fr/pronunciation.py: 86%

1import re

3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from ..share import set_sound_file_url_fields

8from .models import Sound, WordEntry

9from .tags import translate_raw_tags

def extract_pronunciation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    base_data: WordEntry,
) -> None:
    """Extract sounds from a pronunciation section and attach them to entries.

    Sounds are gathered from every list item directly under ``level_node``
    and from "cmn-pron"/"zh-cmn-pron" templates. Nothing is changed when no
    sound is found. Otherwise the sounds, and the categories they carry, are
    appended to:

    * ``base_data`` when ``page_data`` is empty;
    * every entry of ``page_data`` sharing ``base_data.lang_code`` when the
      section is a level 3 title node;
    * the last entry of ``page_data`` in all other cases.
    """
    lang_code = base_data.lang_code
    collected = []
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                collected.extend(
                    process_pron_list_item(wxr, item, [], lang_code)
                )
        elif isinstance(child, TemplateNode) and child.template_name in [
            "cmn-pron",
            "zh-cmn-pron",
        ]:
            collected.extend(process_cmn_pron_template(wxr, child))

    if not collected:
        return

    # Decide which entries receive the sounds before touching any of them.
    if not page_data:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        targets = [
            entry for entry in page_data if entry.lang_code == lang_code
        ]
    else:
        targets = [page_data[-1]]

    for entry in targets:
        entry.sounds.extend(collected)
        for sound in collected:
            entry.categories.extend(sound.categories)

# Names of templates that are handled by `process_pron_template`.
PRON_TEMPLATES = frozenset(
    {
        "prononciation",
        # redirects to "prononciation"
        "pron",
        "//",
        "phon",
        # reconstructed pronunciation
        "pron-recons",  # use "pron"
        "prononciation reconstruite",  # redirect to "pron-recons"
        "pron recons",  # redirect to "pron-recons"
        # used in template "cmn-pron", which expands to list of Pinyin
        "lang",
    }
)

# Names of the "h" marker templates: they produce no sound on their own and
# are only read as a prefix when they directly precede a pronunciation
# template.
ASPIRATED_H_TEMPLATES = frozenset(
    {
        "h aspiré",
        "h",  # redirect to "h aspiré"
        "h muet",
    }
)

def process_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    parent_raw_tags: list[str],
    lang_code: str,
) -> list[Sound]:
    """Extract sounds from one list item and from the list items nested in it.

    Args:
        wxr: Extraction context passed through to the helpers.
        list_item_node: The list item to read.
        parent_raw_tags: Raw tags inherited from the enclosing list item;
            copied, never modified.
        lang_code: Language code of the entry; "zh" stores plain text
            pronunciations in ``zh_pron`` instead of ``ipa``.

    Returns:
        The sounds found in this item followed by those of nested items.
    """
    current_raw_tags = parent_raw_tags[:]
    sounds_list = []
    pron_key = "zh_pron" if lang_code == "zh" else "ipa"
    after_colon = False
    children = list_item_node.children
    for child_index, list_item_child in enumerate(children):
        if isinstance(list_item_child, TemplateNode):
            # Pass the node just before the template (none for the first
            # child) so an "h aspiré" marker can be detected.
            previous = children[max(child_index - 1, 0) : child_index]
            sounds_list.extend(
                process_sound_list_item_templates(
                    wxr,
                    list_item_child,
                    current_raw_tags,
                    after_colon,
                    previous,
                )
            )
        elif isinstance(list_item_child, WikiNode):
            if list_item_child.kind == NodeKind.BOLD:
                # Bold text labels the pronunciations that follow.
                current_raw_tags.append(clean_node(wxr, None, list_item_child))
            elif list_item_child.kind == NodeKind.LINK:
                for span_tag in list_item_child.find_html_recursively("span"):
                    sound = Sound(
                        ipa=clean_node(wxr, None, span_tag),
                        raw_tags=current_raw_tags[:],
                    )
                    translate_raw_tags(sound)
                    sounds_list.append(sound)
        elif isinstance(list_item_child, str):
            if ":" in list_item_child:
                after_colon = True
                # Text after the first colon is the pronunciation itself.
                pron_text = list_item_child[
                    list_item_child.find(":") + 1 :
                ].strip()
                if len(pron_text) > 0:
                    sound = Sound(raw_tags=current_raw_tags[:])
                    setattr(sound, pron_key, pron_text)
                    translate_raw_tags(sound)
                    sounds_list.append(sound)

    # Only recurse into the nearest nested list items: each recursive call
    # handles its own descendants. Recursing into every descendant list item
    # from here as well would process items nested two or more levels deep
    # several times, the first time with only this item's raw tags.
    # NOTE(review): assumes `find_child_recursively` also yields list items
    # located inside other list items - confirm against wikitextprocessor.
    for nest_list_item in _nearest_nested_list_items(list_item_node):
        sounds_list.extend(
            process_pron_list_item(
                wxr, nest_list_item, current_raw_tags, lang_code
            )
        )

    return sounds_list


def _nearest_nested_list_items(node: WikiNode) -> list[WikiNode]:
    """Return descendant list items of ``node`` without descending into them."""
    found = []
    for child in node.children:
        if not isinstance(child, WikiNode):
            continue
        if child.kind == NodeKind.LIST_ITEM:
            found.append(child)
        else:
            found.extend(_nearest_nested_list_items(child))
    return found

128

129

def process_sound_list_item_templates(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
    after_colon: bool,
    pre_nodes: list[WikiNode],
) -> list[Sound]:
    """Dispatch a template found in a pronunciation list item.

    Pronunciation, audio and rhyme templates are turned into sounds by their
    dedicated handlers. The "h" marker templates yield nothing here. Any
    other template that appears before the colon is cleaned to text and
    appended to ``raw_tags`` (modified in place), and no sound is returned.
    """
    name = template_node.template_name
    if name in PRON_TEMPLATES:
        return process_pron_template(wxr, template_node, raw_tags, pre_nodes)
    if name in {"écouter", "audio", "pron-rég"}:
        return [process_ecouter_template(wxr, template_node, raw_tags)]
    if name == "pron-rimes":
        return [process_pron_rimes_template(wxr, template_node, raw_tags)]

    # Remaining templates before the colon are location tags, except the
    # "h" markers which are only read by `process_pron_template`.
    if name not in ASPIRATED_H_TEMPLATES and not after_colon:
        raw_tags.append(clean_node(wxr, None, template_node))
    return []

154

155

def process_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
    previous_nodes: "list[WikiNode] | None" = None,
) -> list[Sound]:
    """Turn a pronunciation template into one sound per distinct value.

    Args:
        wxr: Extraction context used to clean nodes to text.
        template_node: The pronunciation template.
        raw_tags: Raw tags copied onto every sound created.
        previous_nodes: Nodes preceding the template; when the last one is
            an "h" marker template its cleaned text is prefixed to each
            pronunciation. Defaults to no preceding nodes.

    Returns:
        The sounds created, or an empty list when the first template
        parameter is a missing or empty string.
    """
    # `None` replaces the former mutable `[]` default.
    if previous_nodes is None:
        previous_nodes = []

    first_arg = template_node.template_parameters.get(1, "")
    if (
        template_node.template_name in PRON_TEMPLATES
        and isinstance(first_arg, str)
        and len(first_arg) == 0
    ):
        # some pages don't pass IPA parameter to the "pron" template
        # and expand to an edit link for adding the missing data.
        return []

    sounds_list = []
    pron_texts = clean_node(wxr, None, template_node)
    # https://en.wikipedia.org/wiki/Aspirated_h
    # https://fr.wiktionary.org/wiki/Modèle:h_aspiré
    aspirated_h = ""
    if len(previous_nodes) > 0 and isinstance(previous_nodes[-1], TemplateNode):
        if previous_nodes[-1].template_name in ASPIRATED_H_TEMPLATES:
            aspirated_h = clean_node(wxr, None, previous_nodes[-1])

    if len(pron_texts) > 0:
        use_key = "zh_pron" if template_node.template_name == "lang" else "ipa"
        prons = set()
        # Values are separated by an ASCII or a fullwidth comma.
        for pron_text in re.split(r",|，", pron_texts):
            pron_text = pron_text.strip()
            if len(pron_text) > 0 and pron_text not in prons:
                # `prons` filters out repeated values.
                prons.add(pron_text)
                sound = Sound(raw_tags=raw_tags[:])
                setattr(sound, use_key, aspirated_h + pron_text)
                translate_raw_tags(sound)
                sounds_list.append(sound)
    return sounds_list

191

192

def process_ecouter_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Build a sound from a sound file template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:écouter

    Parameter 1 is a location added to the raw tags (surrounding
    parentheses removed), parameter 2 (or "pron" when 2 is absent) is the
    IPA, and "audio" is the file name used to fill the sound file fields.
    """
    params = template_node.template_parameters

    location = clean_node(wxr, None, params.get(1, ""))
    if location.startswith("(") and location.endswith(")"):
        location = location.strip("()")
    ipa = clean_node(wxr, None, params.get(2, params.get("pron", "")))
    audio_file = clean_node(wxr, None, params.get("audio", ""))

    sound = Sound()
    if raw_tags:
        sound.raw_tags = raw_tags[:]
    if location:
        sound.raw_tags.append(location)
    if ipa:
        sound.ipa = ipa
    if audio_file:
        set_sound_file_url_fields(wxr, audio_file, sound)
    translate_raw_tags(sound)
    return sound

225

226

def is_ipa_text(text: str) -> bool:
    """Tell whether an inflection table cell text is IPA.

    IPA text ends with a backslash and starts either with a backslash or
    with "ou ": some inflection table templates like "en-nom-rég" might
    have a second IPA text in a new line.
    """
    return text.endswith("\\") and text.startswith(("\\", "ou "))

236

237

def process_pron_rimes_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> Sound:
    """Build a sound from a "pron-rimes" template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:pron-rimes

    The template is fully expanded; the text of its first "span" tag
    becomes the IPA and the text of the second one the rhymes.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)

    sound = Sound()
    field_by_position = {0: "ipa", 1: "rhymes"}
    for position, span_tag in enumerate(
        expanded_node.find_html_recursively("span")
    ):
        span_text = clean_node(wxr, None, span_tag)
        field = field_by_position.get(position)
        if field is not None:
            setattr(sound, field, span_text)

    if raw_tags:
        sound.raw_tags = raw_tags[:]
    translate_raw_tags(sound)
    # Clean the whole expansion with the sound as the data target.
    clean_node(wxr, sound, expanded_node)
    return sound

261

262

def process_cmn_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Extract sounds from the lists a "cmn-pron" template expands to.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:cmn-pron

    The template is expanded and every item of the resulting top-level
    lists is processed as a "zh" pronunciation list item.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(
        wikitext,
        pre_expand=True,
        additional_expand={template_node.template_name},
    )
    return [
        sound
        for list_node in expanded_node.find_child(NodeKind.LIST)
        for list_item in list_node.find_child(NodeKind.LIST_ITEM)
        for sound in process_pron_list_item(wxr, list_item, [], "zh")
    ]