Coverage for src/wiktextract/extractor/simple/pronunciation.py: 89%

149 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from copy import copy 

3 

4from wikitextprocessor import NodeKind, TemplateArgs, WikiNode 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS # , print_tree 

6 

7from wiktextract import WiktextractContext 

8from wiktextract.page import clean_node 

9 

10# from wiktextract.wxr_logging import logger 

11from .models import Sound, WordEntry 

12from .parse_utils import PANEL_TEMPLATES 

13from .tags_utils import convert_tags 

14 

# Matches a leading "hyphenation" label (case-insensitive, surrounded by
# optional whitespace and optionally followed by "," and/or ":"); group 1
# captures whatever text comes after the label.
REMOVE_HYPHENATION_RE = re.compile(r"(?i)\s*hyphenation\s*,?:?\s*(.+)")

16 

def recurse_list(
    wxr: WiktextractContext,
    node: WikiNode,
    sound_templates: list[Sound],
    poses: list[str],
    raw_tags: list[str],
) -> tuple[list[str] | None, list[str] | None]:
    """Process a LIST node by handing each of its children to
    `recurse_list_item`.

    A list with exactly one child returns whatever that child returns, so
    that a lone heading-like line can pass POS names / raw tags back to the
    caller. A list with several children applies the values returned by one
    child to the children that follow it and always returns `(None, None)`.
    """
    assert node.kind == NodeKind.LIST

    items = node.children
    # Work on a copy so the caller's list is never the one we rebind/pass on.
    level_tags = raw_tags[:]

    if len(items) == 1:
        ### HACKYHACKHACK ###
        # ; pos or raw tags
        # * pron 1
        # * pron 2
        # The first line is typically used in Simple English Wiktionary
        # for tagging entries "below" it, even though ";" shouldn't
        # be used to make things bold according to wikitext guidelines
        # (creates broken HTML5 and breaks screen-readers). The ";" list
        # is also separate from the "*" list, so they're completely separated
        # in our parse tree; two different LIST objects!
        only_item = items[0]  # Guaranteed LIST_ITEM
        return recurse_list_item(
            wxr,
            only_item,  # type:ignore[arg-type]
            sound_templates,
            poses,
            level_tags,
        )

    level_poses = poses
    for item in items:
        # We are pretty much guaranteed a LIST will only have LIST_ITEM
        # children.
        item_poses, item_tags = recurse_list_item(
            wxr,
            item,  # type:ignore[arg-type]
            sound_templates,
            level_poses,
            level_tags,
        )
        if item_poses is not None:
            level_poses = item_poses
        if item_tags:
            level_tags = raw_tags + item_tags

    return None, None

63 

64 

def recurse_list_item(
    wxr: WiktextractContext,
    node: WikiNode,
    sound_templates: list[Sound],
    poses: list[str],
    raw_tags: list[str],
) -> tuple[list[str] | None, list[str] | None]:
    """Recurse through list and list_item nodes.

    The part of `node` that precedes any sublist is cleaned into text:

    * If the text has no ``__SOUND_<n>__`` marker it is passed through
      `convert_tags`. When that yields POS names or tags and `node` has no
      sublists, they are returned so the caller can apply them to `poses`
      and `raw_tags` for the following entries on the same level.
    * Otherwise every marker is resolved to ``sound_templates[n]``; that
      Sound receives the inherited `raw_tags` plus the most recent tags
      found in front of a marker on this line, and its `poses` is set from
      the line's POS names or, failing that, the inherited `poses`.

    Sublists of `node` are then handled by `recurse_list`.

    Returns a tuple ``(poses, raw_tags)`` where each element is a list of
    strings or None; ``(None, None)`` when there is nothing to hand back.
    """

    # We can trust that contents has only stuff from the beginning of this
    # list_item because lists would "consume" the rest.
    assert node.kind in (NodeKind.LIST_ITEM, NodeKind.ROOT)

    contents = list(node.invert_find_child(NodeKind.LIST))

    text = clean_node(wxr, None, contents).strip()
    text_raw_tags: list[str] = []
    text_poses: list[str] = []

    if "__SOUND" not in text:
        # This is text without a pronunciation template like {{ipa}}.
        # Simple Wikt. is pretty consistent with its pron. templates, so
        # we dismiss the possibility that someone put a stray string of
        # IPA here, and treat the text as a description or a reference
        # to a sense.
        # XXX extract raw tags more gracefully
        text_tags, text_raw_tags, text_poses = convert_tags([text])
        text_raw_tags.extend(text_tags)  # let's ignore the normal tags for now

        has_labels = len(text_poses) > 0 or len(text_raw_tags) > 0
        if has_labels and len(contents) == len(node.children):
            # No sublists in this node: hand the labels back to the caller,
            # e.g. for a heading line like "noun 1".
            return (text_poses or None, text_raw_tags or None)

    # These persist across markers on the same line: a later sound without
    # its own prefix text keeps the labels of an earlier one.
    line_raw_tags: list[str] = []
    line_poses: list[str] = []
    for prefix, index in re.findall(r"([^_]*)__SOUND_(\d+)__", text):
        part_tags, new_raw_tags, part_poses = convert_tags([prefix])
        new_raw_tags.extend(part_tags)
        if len(part_poses) > 0:
            line_poses = part_poses
        if len(new_raw_tags) > 0:
            line_raw_tags = new_raw_tags

        sound = sound_templates[int(index)]  # the Sound object

        # These sound datas are attached to POS data later; for this, we
        # use the sound.poses field.
        if len(line_poses) > 0 or len(poses) > 0:
            # Copy: the same `line_poses`/`poses` list would otherwise be
            # handed to every Sound on this level, so a later in-place
            # change to one Sound's list could show up in its siblings.
            sound.poses = list(line_poses or poses)

        sound.raw_tags.extend(raw_tags + line_raw_tags)

    this_level_tags = raw_tags + text_raw_tags

    if len(text_poses) > 0:
        # Build a new list; the caller's `poses` must not be mutated.
        poses = [*poses, *text_poses]

    for sublist in node.find_child(NodeKind.LIST):
        new_poses, new_tags = recurse_list(
            wxr, sublist, sound_templates, poses, this_level_tags
        )
        if new_poses is not None:
            poses = new_poses
        if new_tags is not None:
            this_level_tags = raw_tags + new_tags

    return None, None

142 

143 

def recursively_complete_sound_data(
    wxr: WiktextractContext, node: WikiNode, sound_templates: list[Sound]
) -> None:
    """Walk every list below `node` and complete the entries of
    `sound_templates` with the pronunciation data found there.

    `node` should be a NodeKind.ROOT node; the walk starts with no inherited
    POS names and no inherited raw tags."""
    initial_poses: list[str] = []
    initial_raw_tags: list[str] = []
    recurse_list_item(
        wxr, node, sound_templates, initial_poses, initial_raw_tags
    )

152 

153 

def process_pron(
    wxr: WiktextractContext,
    node: WikiNode,
    target_data: WordEntry,
) -> None:
    """Process a Pronunciation section WikiNode, extracting Sound data entries
    which are inserted into target_data.sounds. target_data is a WordEntry, so
    can be base_data (used to complete other entries) or an individual POS
    entry.

    Hyphenation template output is written to target_data.hyphenation. If at
    least one Sound was collected, target_data.sounds is replaced (not
    extended) with the new list; otherwise it is left untouched.
    """

    # XXX: figure out a way to collect category here with clean_node so that
    # it can be properly assigned to the right POS WordEntry; currently,
    # clean_node insert the category stuff straight into whatever data object
    # is its target, which would make target_data (if used for this) basically
    # have every category from every pron-section level. We already use a hack
    # to later assign sound data to the correct POS with the `poses` field in
    # Sound... Currently ignoring categories for sound.

    # We save data in parse_pronunciation_template_fn into this local list,
    # so the template_fn has to be defined inside this larger function so
    # that it has easy access to sound_templates. Can't be defined outside
    # this function either, because we need access to `wxr` from here, and
    # the template_fn signature is already set in wikitextprocessor.
    sound_templates: list[Sound] = []

    def _register_sound(sound: Sound) -> str:
        """Store `sound` and return the text marker that refers to it."""
        sound_templates.append(sound)
        return f"__SOUND_{len(sound_templates) - 1}__"

    # Template handling functions are function objects that are used to process
    # a parsed template node (a WikiNode object parsed from `{{template|arg}}`
    # that hasn't been expanded yet into text). The function objects are passed
    # into certain functions like clean_node. When the expander comes
    # across a template, before expanding the template it calls whatever was
    # passed to template_fn=. The handler function can return a string, which is
    # inserted into the returned text, or None in case nothing special will be
    # done with the template and it will use the normal expanded value.
    # post_template_fn= is the same, except it happens after the template has
    # already been expanded and receives that text as a parameter. If you want
    # to replace templates based on their names or arguments before expansion,
    # use template_fn=, if you want access to the expanded text without doing
    # it manually, use post_template_fn=
    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> str | None:
        """Replace pronunciation templates with ``__SOUND_<n>__`` markers."""
        lname = name.lower()
        if lname in PANEL_TEMPLATES:
            return ""
        if lname == "audio":
            filename = ht.get(1) or ""
            desc = clean_node(wxr, None, [ht.get(2) or ""]).strip()
            audio = Sound(audio=filename.strip())
            if desc:
                audio.raw_tags.append(desc)
            return _register_sound(audio)
        if lname == "audio-ipa":
            filename = ht.get(1) or ""
            ipa = clean_node(wxr, None, [ht.get(2) or ""])
            audio = Sound(audio=filename.strip())
            if ipa:
                audio.ipa = ipa
            return _register_sound(audio)
        if lname == "ipachar":
            ipa = ht.get(1) or ""
            if ipa:
                ipa = clean_node(wxr, None, [ipa])
                return _register_sound(Sound(ipa=ipa.strip()))
        # Simple Wiktionary AHD = enPR
        # The IPA templates of Simple Wiktionary are simple enough that we can
        # just pull the data from the arguments and clean them up and use as
        # is; in contrast with en.wiktionary's templates, where you can have
        # processed qualifiers everywhere, it becomes necessary to do all of
        # this in post_template_fn= and parse the expanded output.
        if lname in ("ipa", "sampa", "ahd", "enpr"):
            # Only the first non-empty positional argument (1-4) is used.
            for key in (1, 2, 3, 4):
                arg = ht.get(key, "")
                if not arg:
                    continue
                value = clean_node(wxr, None, [arg]).strip()
                if lname == "ipa":
                    return _register_sound(Sound(ipa=value))
                if lname == "sampa":
                    return _register_sound(Sound(sampa=value))
                # "ahd" or "enpr"
                return _register_sound(Sound(enpr=value))
        if lname in ("homophone", "homophones", "hmp"):
            homophones = list(ht.values())
            if homophones:
                audio = Sound()
                for hp in homophones:
                    audio.homophones.append(clean_node(wxr, None, [hp]))
                return _register_sound(audio)

        return None

    def post_parse_pronunciation_template_fn(
        name: str,
        ht: TemplateArgs,
        expanded: str,
    ) -> str | None:
        """Capture expanded hyphenation templates into target_data."""
        lname = name.lower()
        if lname in PANEL_TEMPLATES:
            return ""
        if lname in ("hyph", "hyphenation"):
            # Add hyphenation template output straight to data
            # English hyphenation rules don't make sense. You don't break up
            # "united" into "ụ-nit-ed", that t definitely belongs at the
            # beginning of the last syllable. """you-night.... ED""". Bah.
            text = clean_node(
                wxr,
                None,
                [expanded],
            )  # clean_node strip()s by default
            m = REMOVE_HYPHENATION_RE.match(text)
            if m:
                text = m.group(1)
            if not text:
                # Nothing usable: keep the normal expanded value.
                return None
            if target_data.hyphenation:
                target_data.hyphenation += "; " + text
            else:
                target_data.hyphenation = text
            return ""
        return None

    # Using the already parsed parse-tree would be ideal, but because wikitext
    # is written by humans, sometimes that parse-tree does not actually reflect
    # what it *is*. A pronunciation section for Simple Wiktionary is relatively
    # simple and usually pretty similar, but if we suddenly were introduced to
    # something like a template that generates more entries that are appended to
    # the source text as bullet points, that isn't reflected in the parse tree
    # which has unexpanded templates. The parse tree also doesn't understand
    # what a line is; newlines are just parts of strings, or something that
    # would be created by a parse node implicitly. We know that a line is
    # meaningful in this context: if we use clean_node to revert the parse tree
    # and expand template nodes found in it so that we have a 'clean' raw-text
    # representation with a few remnants of wikitext syntax (like headers and
    # bullet point syntax), then we can handle each line separately and also
    # keep an eye out on the list hierarchy/depth at the same time. The expanded
    # templates can also be handled in template_fn and post_template_fn,
    # even going so far as to leave magic markers in the text that are easily
    # regexable later.
    parts: list[str] = [
        clean_node(
            wxr,
            None,
            child,
            template_fn=parse_pronunciation_template_fn,
            post_template_fn=post_parse_pronunciation_template_fn,
            no_strip=True,
        )
        for child in node.invert_find_child(LEVEL_KIND_FLAGS)
    ]
    pron_main = "".join(parts)

    # We parse the already expanded and cleaned text; templates have been either
    # expanded or they've been replaced with something in the _template_fn
    # handler functions. We're left with a "bare-bones" parse tree that mainly
    # has list structure.
    # This is future-proofing, but it's an issue in other extractors: if a
    # template is used to generate the pronunciation section list, it has
    # been expanded here and properly parsed.
    pron_root = wxr.wtp.parse(pron_main)

    recursively_complete_sound_data(wxr, pron_root, sound_templates)

    # Remove duplicate tags. dict.fromkeys() keeps first-seen order, so the
    # output does not depend on string hash randomisation the way
    # list(set(...)) does.
    for st in sound_templates:
        legit_tags, raw_tags, poses = convert_tags(st.raw_tags)
        if len(legit_tags) > 0:
            # raw_tags is only rewritten when at least one recognised tag
            # was split out of it.
            st.tags = list(dict.fromkeys(legit_tags))
            st.raw_tags = list(dict.fromkeys(raw_tags))
        # Build a fresh list instead of extending in place: the existing
        # st.poses list may be shared with other Sound objects.
        st.poses = list(dict.fromkeys([*st.poses, *poses]))

    if len(sound_templates) > 0:
        # completely replace sound data with new
        target_data.sounds = sound_templates

    return None