Coverage for src/wiktextract/extractor/simple/pronunciation.py: 89%

149 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from copy import copy 

3 

4from wikitextprocessor import NodeKind, TemplateArgs, WikiNode 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS # , print_tree 

6 

7from wiktextract import WiktextractContext 

8from wiktextract.page import clean_node 

9 

10# from wiktextract.wxr_logging import logger 

11from .models import Sound, WordEntry 

12from .parse_utils import PANEL_TEMPLATES 

13from .tags_utils import convert_tags 

14 

# Strips a leading "Hyphenation" label (case-insensitive, with optional
# comma/colon) from expanded {{hyph}}/{{hyphenation}} template output,
# capturing the remainder in group 1.
REMOVE_HYPHENATION_RE = re.compile(r"(?i)\s*hyphenation\s*,?:?\s*(.+)")

16 

17 

def recurse_list(
    wxr: WiktextractContext,
    node: WikiNode,
    sound_templates: list[Sound],
    poses: list[str],
    raw_tags: list[str],
) -> tuple[list[str] | None, list[str] | None]:
    """Walk a LIST node, feeding each LIST_ITEM child to recurse_list_item.

    POSes and raw tags returned by a child are threaded into the state used
    for its following siblings. Returns (poses, raw_tags) only via the
    single-child shortcut below; otherwise (None, None).
    """
    assert node.kind == NodeKind.LIST

    inherited_tags = list(raw_tags)

    if len(node.children) == 1:
        ### HACKYHACKHACK ###
        # ; pos or raw tags
        # * pron 1
        # * pron 2
        # Simple English Wiktionary typically uses a lone ";" line to tag
        # the entries "below" it, even though ";" shouldn't be used just to
        # bold text (it creates broken HTML5 and breaks screen-readers).
        # The ";" line parses as its own LIST, separate from the "*" list,
        # so the two end up as different LIST objects in the parse tree.
        # Propagate the child's result upward so the caller can apply it
        # to the sibling list.
        only_child = node.children[0]
        return recurse_list_item(
            wxr,
            only_child,  # type:ignore[arg-type]  # a LIST holds only LIST_ITEMs
            sound_templates,
            poses,
            inherited_tags,
        )

    for item in node.children:
        child_poses, child_tags = recurse_list_item(
            wxr,
            item,  # type:ignore[arg-type]  # guaranteed LIST_ITEM
            sound_templates,
            poses,
            inherited_tags,
        )
        if child_poses is not None:
            poses = child_poses
        if child_tags:
            inherited_tags = raw_tags + child_tags

    return None, None

64 

65 

def recurse_list_item(
    wxr: WiktextractContext,
    node: WikiNode,
    sound_templates: list[Sound],
    poses: list[str],
    raw_tags: list[str],
) -> tuple[list[str] | None, list[str] | None]:
    """Recurse through list and list_item nodes. In some cases, a call might
    return a tuple of a POS string and list of raw tags, which can be applied
    to `pos` and `raw_tags` parameters on the same level.

    Side effect: Sound objects in `sound_templates` referenced by
    "__SOUND_<n>__" magic markers in this item's text get `poses` and
    `raw_tags` attached to them in place.
    """

    # We can trust that contents has only stuff from the beginning of this
    # list_item because lists would "consume" the rest.
    assert node.kind in (NodeKind.LIST_ITEM, NodeKind.ROOT)

    # Children of this item that are NOT sublists, i.e. the item's own line.
    contents = list(node.invert_find_child(NodeKind.LIST))

    text = clean_node(wxr, None, contents).strip()
    text_raw_tags: list[str] = []
    text_poses: list[str] = []

    if "__SOUND" not in text:
        # This is text without a pronunciation template like {{ipa}}.
        # Simple Wikt. is pretty consistent with its pron. templates, so
        # we dismiss the possibility that someone put a stray string of
        # IPA here, and treat the text as a description or a reference
        # to a sense.
        # XXX extract raw tags more gracefully
        text_tags, text_raw_tags, text_poses = convert_tags([text])
        text_raw_tags.extend(text_tags)  # let's ignore the normal tags for now

        if len(text_poses) > 0 or len(text_raw_tags) > 0:
            if len(contents) == len(node.children):
                # No sublists in this node; hand the POSes/tags up to the
                # caller so it can apply them to sibling lines.
                return (
                    text_poses or None,
                    text_raw_tags or None,
                )  # return "noun 1"

    line_raw_tags: list[str] = []
    line_poses: list[str] = []
    # Each match is (leading text, sound index). The leading text may carry
    # qualifiers (e.g. region labels) that apply to the marker after it.
    for sound_m in re.findall(r"([^_]*)__SOUND_(\d+)__", text):
        part_tags, new_raw_tags, part_poses = convert_tags([sound_m[0]])
        new_raw_tags.extend(part_tags)
        if len(part_poses) > 0:
            line_poses = part_poses
        if len(new_raw_tags) > 0:
            line_raw_tags = new_raw_tags

        i = int(sound_m[-1])  # (\d+)
        sound = sound_templates[i]  # the Sound object

        # These sound data are attached to POS data later; for this, we
        # use the sound.pos field.
        if len(line_poses) > 0 or len(poses) > 0:
            sound.poses = line_poses or poses

        # print(f"{raw_tags=}, {line_raw_tags=}")
        for d in raw_tags + line_raw_tags:
            sound.raw_tags.append(d)

    # Tags that sublists below this item inherit.
    this_level_tags = raw_tags + text_raw_tags

    if len(text_poses) > 0:
        # Copy before extending so the caller's list isn't mutated.
        poses = copy(poses)
        poses.extend(text_poses)

    for li in node.find_child(NodeKind.LIST):
        new_poses, new_tags2 = recurse_list(
            wxr, li, sound_templates, poses, this_level_tags
        )
        if new_poses is not None:
            poses = new_poses
        if new_tags2 is not None:
            this_level_tags = raw_tags + new_tags2

    return None, None

143 

144 

def recursively_complete_sound_data(
    wxr: WiktextractContext, node: WikiNode, sound_templates: list[Sound]
) -> None:
    """Walk all pronunciation lists under `node` (a ROOT node), attaching
    POS and tag context to the Sound objects in `sound_templates`."""
    # The ROOT node is treated like a top-level list item; start with no
    # inherited POSes or raw tags.
    recurse_list_item(wxr, node, sound_templates, poses=[], raw_tags=[])

153 

154 

def process_pron(
    wxr: WiktextractContext,
    node: WikiNode,
    target_data: WordEntry,
) -> None:
    """Process a Pronunciation section WikiNode, extracting Sound data entries
    which are inserted into target_data.sounds. target_data is a WordEntry, so
    can be base_data (used to complete other entries) or an individual POS
    entry."""

    # XXX: figure out a way to collect category here with clean_node so that
    # it can be properly assigned to the right POS WordEntry; currently,
    # clean_node insert the category stuff straight into whatever data object
    # is its target, which would make target_data (if used for this) basically
    # have every category from every pron-section level. We already use a hack
    # to later assign sound data to the correct POS with the `pos` field in
    # SoundEntry... Currently ignoring categories for sound.

    # print("====")
    # print_tree(pr_node)

    # We save data in parse_pronunciation_template_fn into this local list,
    # so the template_fn has to be defined inside this larger function so
    # that it has easy access to sound_templates. Can't be defined outside
    # this function either, because we need access to `wxr` from here, and
    # the template_fn signature is already set in wikitextprocessor.
    sound_templates: list[Sound] = []

    # Template handling functions are function objects that are used to process
    # a parsed template node (a WikiNode object parsed from `{{template|arg}}`
    # that hasn't been expanded yet into text). The function objects are passed
    # into certain functions like clean_node. When the expander comes
    # across a template, before expanding the template it calls whatever was
    # passed to template_fn=. The handler function can return a string, which is
    # inserted into the returned text, or None in case nothing special will be
    # done with the template and it will use the normal expanded value.
    # post_template_fn= is the same, except it happens after the template has
    # already been expanded and receives that text as a parameter. If you want
    # to replace templates based on their names or arguments before expansion,
    # use template_fn=, if you want access to the expanded text without doing
    # it manually, use post_template_fn=
    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> str | None:
        # Replace known pronunciation templates with "__SOUND_<i>__" magic
        # markers, storing the extracted Sound in sound_templates[i].
        lname = name.lower()
        if lname in PANEL_TEMPLATES:
            # Sidebar/navigation panels contribute nothing; erase them.
            return ""
        if lname == "audio":
            filename = ht.get(1) or ""
            desc = ht.get(2) or ""
            desc = clean_node(wxr, None, [desc]).strip()
            audio = Sound(audio=filename.strip())
            if desc:
                audio.raw_tags.append(desc)
            sound_templates.append(audio)
            return "__SOUND_" + str(len(sound_templates) - 1) + "__"
        if lname == "audio-ipa":
            filename = ht.get(1) or ""
            ipa = ht.get(2) or ""
            ipa = clean_node(wxr, None, [ipa])
            audio = Sound(audio=filename.strip())
            if ipa:
                audio.ipa = ipa
            sound_templates.append(audio)
            return "__SOUND_" + str(len(sound_templates) - 1) + "__"
        if lname == "ipachar":
            ipa = ht.get(1) or ""
            if ipa:
                ipa = clean_node(wxr, None, [ipa])
                audio = Sound(ipa=ipa.strip())
                sound_templates.append(audio)
                return "__SOUND_" + str(len(sound_templates) - 1) + "__"
        # Simple Wiktionary AHD = enPR
        # The IPA templates of Simple Wiktionary are simple enough that we can
        # just pull the data from the arguments and clean them up and use as
        # is; in contrast with en.wiktionary's templates, where you can have
        # processed qualifiers everywhere, it becomes necessary to do all of
        # this in post_template_fn= and parse the expanded output.
        if lname in (
            "ipa",
            "sampa",
            "ahd",
            "enpr",
        ):
            for ipa in (ht.get(x, "") for x in (1, 2, 3, 4)):
                if ipa:
                    ipa = clean_node(wxr, None, [ipa])
                    if lname == "ipa":
                        audio = Sound(ipa=ipa.strip())
                    elif lname == "sampa":
                        audio = Sound(sampa=ipa.strip())
                    elif lname in ("ahd", "enpr"):
                        audio = Sound(enpr=ipa.strip())
                    sound_templates.append(audio)
                    # NOTE(review): this returns on the FIRST non-empty
                    # argument, so args 2-4 of e.g. {{ipa|/a/|/b/}} are
                    # discarded — confirm whether multi-arg uses exist.
                    return "__SOUND_" + str(len(sound_templates) - 1) + "__"
        if lname in (
            "homophone",
            "homophones",
            "hmp",
        ):
            # ht.values() includes all template arguments, positional and
            # named alike; each becomes one homophone entry.
            homophones = [s for s in ht.values()]
            audio = Sound()
            for hp in homophones:
                hp = clean_node(wxr, None, [hp])
                audio.homophones.append(hp)
            if homophones:
                sound_templates.append(audio)
                return "__SOUND_" + str(len(sound_templates) - 1) + "__"

        # Unrecognized template: let the expander handle it normally.
        return None

    def post_parse_pronunciation_template_fn(
        name: str,
        ht: TemplateArgs,
        expanded: str,
    ) -> str | None:
        # Runs after template expansion; handles templates whose useful
        # output only exists in expanded form (hyphenation).
        lname = name.lower()
        if lname in PANEL_TEMPLATES:
            return ""
        if lname in ("hyph", "hyphenation"):
            # Add hyphenation template output straight to data
            # English hyphenation rules don't make sense. You don't break up
            # "united" into "ụ-nit-ed", that t definitely belongs at the
            # beginning of the last syllable. """you-night.... ED""". Bah.
            text = clean_node(
                wxr,
                None,
                [expanded],
            )  # clean_node strip()s by default
            m = REMOVE_HYPHENATION_RE.match(text)
            if m:
                # Drop the leading "Hyphenation:" label.
                text = m.group(1)
            if text and target_data.hyphenation:
                target_data.hyphenation += "; " + text
            elif text:
                target_data.hyphenation = text
            else:
                return None
            # Consumed into target_data; erase from the text stream.
            return ""
        return None

    # Using the already parsed parse-tree would be ideal, but because wikitext
    # is written by humans, sometimes that parse-tree does not actually reflect
    # what it *is*. A pronunciation section for Simple Wiktionary is relatively
    # simple and usually pretty similar, but if we suddenly were introduced to
    # something like a template that generates more entries that are appended to
    # the source text as bullet points, that isn't reflected in the parse tree
    # which has unexpanded templates. The parse tree also doesn't understand
    # what a line is; newlines are just parts of strings, or something that
    # would be created by a parse node implicitly. We know that a line is
    # meaningful in this context: if we use clean_node to revert the parse tree
    # and expand template nodes found in it so that we have a 'clean' raw-text
    # representation with a few remnants of wikitext syntax (like headers and
    # bullet point syntax), then we can handle each line separately and also
    # keep an eye out on the list hierarchy/depth at the same time. The expanded
    # templates can also be handled in template_fn and post_template_fn,
    # even going so far as to leave magic markers in the text that are easily
    # regexable later.
    parts: list[str] = []
    for i, child in enumerate(node.invert_find_child(LEVEL_KIND_FLAGS)):
        parts.append(
            clean_node(
                wxr,
                None,
                child,
                template_fn=parse_pronunciation_template_fn,
                post_template_fn=post_parse_pronunciation_template_fn,
                no_strip=True,
            )
        )
    pron_main = "".join(parts)

    # logger.debug(f"{wxr.wtp.title}\n{pron_main}")

    # We parse the already expanded and cleaned text; templates have been either
    # expanded or they've been replaced with something in the _template_fn
    # handler functions. We're left with a "bare-bones" parse tree that mainly
    # has list structure.
    # This is future-proofing, but it's an issue in other extractors: if a
    # template is used to generate the pronunciation section list, it has
    # been expanded here and properly parsed.
    pron_root = wxr.wtp.parse(pron_main)
    # logger.debug(print_tree(pron_root, indent=2, ret_value=True))

    recursively_complete_sound_data(wxr, pron_root, sound_templates)

    # print(pron_main)
    # for st in sound_templates:
    #     print(st.model_dump_json(exclude_defaults=True))

    # print(target_data)

    # remove duplicate tags
    for st in sound_templates:
        legit_tags, raw_tags, poses = convert_tags(st.raw_tags)
        if len(legit_tags) > 0:
            st.tags = sorted(set(legit_tags))
            st.raw_tags = sorted(set(raw_tags))
        if len(poses) > 0:
            st.poses.extend(poses)
        st.poses = sorted(set(st.poses))

    if len(sound_templates) > 0:
        # completely replace sound data with new
        target_data.sounds = sound_templates

    return None