Coverage for src/wiktextract/extractor/simple/pronunciation.py: 89%

1import re

2from copy import copy

4from wikitextprocessor import NodeKind, TemplateArgs, WikiNode

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS # , print_tree

7from wiktextract import WiktextractContext

8from wiktextract.page import clean_node

10# from wiktextract.wxr_logging import logger

11from .models import Sound, WordEntry

12from .parse_utils import PANEL_TEMPLATES

13from .tags_utils import convert_tags

15REMOVE_HYPHENATION_RE = re.compile(r"(?i)\s*hyphenation\s*,?:?\s*(.+)")

def recurse_list(
    wxr: WiktextractContext,
    node: WikiNode,
    sound_templates: list[Sound],
    poses: list[str],
    raw_tags: list[str],
) -> tuple[list[str] | None, list[str] | None]:
    """Walk a LIST node, handing each of its children to
    `recurse_list_item`.

    A list with exactly one child returns whatever that single item
    returned, so that the caller can apply it to what follows on its own
    level. A list with any other number of children applies the values
    returned by an item to the items after it and returns `(None, None)`.
    """
    assert node.kind == NodeKind.LIST

    items = node.children

    if len(items) == 1:
        ### HACKYHACKHACK ###
        # ; pos or raw tags
        # * pron 1
        # * pron 2
        # The first line is typically used in Simple English Wiktionary
        # for tagging entries "below" it, even though ";" shouldn't
        # be used to make things bold according to wikitext guidelines
        # (creates broken HTML5 and breaks screen-readers). The ";" list
        # is also separate from the "*" list, so they're completely separated
        # in our parse tree; two different LIST objects!
        return recurse_list_item(
            wxr,
            # Guaranteed LIST_ITEM
            items[0],  # type:ignore[arg-type]
            sound_templates,
            poses,
            raw_tags[:],
        )

    current_poses = poses
    current_tags = raw_tags[:]
    for item in items:
        item_poses, item_tags = recurse_list_item(
            wxr,
            # We are pretty much guaranteed a LIST will only have
            # LIST_ITEM children.
            item,  # type:ignore[arg-type]
            sound_templates,
            current_poses,
            current_tags,
        )
        # Whatever an item hands back applies to the items after it.
        if item_poses is not None:
            current_poses = item_poses
        if item_tags:
            current_tags = raw_tags + item_tags

    return None, None

def recurse_list_item(
    wxr: WiktextractContext,
    node: WikiNode,
    sound_templates: list[Sound],
    poses: list[str],
    raw_tags: list[str],
) -> tuple[list[str] | None, list[str] | None]:
    """Process one LIST_ITEM (or the ROOT node) and recurse into its sublists.

    The node's own content (everything that is not a nested LIST) is cleaned
    into text. Then:

    * If the text has no ``__SOUND_<n>__`` marker it is passed to
      `convert_tags`. When that yields POS names or tags and the node has no
      sublists, they are returned as ``(poses or None, raw_tags or None)`` so
      the caller can apply them to `poses` and `raw_tags` on its own level.
    * For every ``__SOUND_<n>__`` marker, ``sound_templates[n]`` receives the
      applicable POS list and raw tags. The text in front of a marker is
      itself passed to `convert_tags`, and its result stays in effect for
      the remaining markers of the same item.

    Nested lists are then handed to `recurse_list` with the POS and tag lists
    accumulated on this level. In every case other than the early return
    described above the result is ``(None, None)``.
    """

    # We can trust that contents has only stuff from the beginning of this
    # list_item because lists would "consume" the rest.
    assert node.kind in (NodeKind.LIST_ITEM, NodeKind.ROOT)

    contents = list(node.invert_find_child(NodeKind.LIST))

    text = clean_node(wxr, None, contents).strip()
    text_raw_tags: list[str] = []
    text_poses: list[str] = []

    if "__SOUND" not in text:
        # This is text without a pronunciation template like {{ipa}}.
        # Simple Wikt. is pretty consistent with its pron. templates, so
        # we dismiss the possibility that someone put a stray string of
        # IPA here, and treat the text as a description or a reference
        # to a sense.
        # XXX extract raw tags more gracefully
        text_tags, text_raw_tags, text_poses = convert_tags([text])
        text_raw_tags.extend(text_tags)  # let's ignore the normal tags for now

        # Every child being "content" means there are no sublists here.
        has_sublists = len(contents) != len(node.children)
        if (text_poses or text_raw_tags) and not has_sublists:
            return (
                text_poses or None,
                text_raw_tags or None,
            )  # return "noun 1"

    line_raw_tags: list[str] = []
    line_poses: list[str] = []
    for prefix, index in re.findall(r"([^_]*)__SOUND_(\d+)__", text):
        part_tags, new_raw_tags, part_poses = convert_tags([prefix])
        new_raw_tags.extend(part_tags)
        if part_poses:
            line_poses = part_poses
        if new_raw_tags:
            line_raw_tags = new_raw_tags

        sound = sound_templates[int(index)]  # the Sound object

        # These sound datas are attached to POS data later; for this, we
        # use the sound.poses field.
        if line_poses or poses:
            # Assign a copy: several sounds would otherwise be given the
            # very same list object, and an in-place change made through one
            # of them later could show up in the others.
            sound.poses = list(line_poses or poses)

        sound.raw_tags.extend(raw_tags + line_raw_tags)

    this_level_tags = raw_tags + text_raw_tags

    if text_poses:
        # Build a new list so the caller's `poses` is left untouched.
        poses = [*poses, *text_poses]

    for sublist in node.find_child(NodeKind.LIST):
        new_poses, new_tags = recurse_list(
            wxr, sublist, sound_templates, poses, this_level_tags
        )
        if new_poses is not None:
            poses = new_poses
        if new_tags is not None:
            this_level_tags = raw_tags + new_tags

    return None, None

142

143

def recursively_complete_sound_data(
    wxr: WiktextractContext, node: WikiNode, sound_templates: list[Sound]
) -> None:
    """Entry point for the recursive walk over the pronunciation lists.

    `node` should be the NodeKind.ROOT node; it is handed to
    `recurse_list_item` together with `sound_templates`, starting from empty
    POS and raw-tag lists.
    """
    initial_poses: list[str] = []
    initial_raw_tags: list[str] = []
    recurse_list_item(
        wxr, node, sound_templates, initial_poses, initial_raw_tags
    )

152

153

def process_pron(
    wxr: WiktextractContext,
    node: WikiNode,
    target_data: WordEntry,
) -> None:
    """Process a Pronunciation section WikiNode, extracting Sound data entries
    which are inserted into target_data.sounds. target_data is a WordEntry, so
    can be base_data (used to complete other entries) or an individual POS
    entry.

    Output of hyphenation templates is stored in target_data.hyphenation.
    If at least one Sound was collected, target_data.sounds is replaced by
    the new list; otherwise it is left as it was.
    """

    # XXX: figure out a way to collect category here with clean_node so that
    # it can be properly assigned to the right POS WordEntry; currently,
    # clean_node inserts the category stuff straight into whatever data object
    # is its target, which would make target_data (if used for this) basically
    # have every category from every pron-section level. We already use a hack
    # to later assign sound data to the correct POS with the `poses` field of
    # Sound... Currently ignoring categories for sound.

    # We save data in parse_pronunciation_template_fn into this local list,
    # so the template_fn has to be defined inside this larger function so
    # that it has easy access to sound_templates. Can't be defined outside
    # this function either, because we need access to `wxr` from here, and
    # the template_fn signature is already set in wikitextprocessor.
    sound_templates: list[Sound] = []

    def register_sound(sound: Sound) -> str:
        """Store `sound` and return the text marker that stands in for it;
        the marker carries the index of `sound` in sound_templates."""
        sound_templates.append(sound)
        return "__SOUND_" + str(len(sound_templates) - 1) + "__"

    # Template handling functions are function objects that are used to process
    # a parsed template node (a WikiNode object parsed from `{{template|arg}}`
    # that hasn't been expanded yet into text). The function objects are passed
    # into certain functions like clean_node. When the expander comes
    # across a template, before expanding the template it calls whatever was
    # passed to template_fn=. The handler function can return a string, which is
    # inserted into the returned text, or None in case nothing special will be
    # done with the template and it will use the normal expanded value.
    # post_template_fn= is the same, except it happens after the template has
    # already been expanded and receives that text as a parameter. If you want
    # to replace templates based on their names or arguments before expansion,
    # use template_fn=, if you want access to the expanded text without doing
    # it manually, use post_template_fn=
    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> str | None:
        lname = name.lower()
        if lname in PANEL_TEMPLATES:
            return ""
        if lname == "audio":
            filename = ht.get(1) or ""
            desc = clean_node(wxr, None, [ht.get(2) or ""]).strip()
            audio = Sound(audio=filename.strip())
            if desc:
                audio.raw_tags.append(desc)
            return register_sound(audio)
        if lname == "audio-ipa":
            filename = ht.get(1) or ""
            ipa = clean_node(wxr, None, [ht.get(2) or ""])
            audio = Sound(audio=filename.strip())
            if ipa:
                audio.ipa = ipa
            return register_sound(audio)
        if lname == "ipachar":
            ipa = ht.get(1) or ""
            if ipa:
                ipa = clean_node(wxr, None, [ipa])
                return register_sound(Sound(ipa=ipa.strip()))
        # Simple Wiktionary AHD = enPR
        # The IPA templates of Simple Wiktionary are simple enough that we can
        # just pull the data from the arguments and clean them up and use as
        # is; in contrast with en.wiktionary's templates, where you can have
        # processed qualifiers everywhere, it becomes necessary to do all of
        # this in post_template_fn= and parse the expanded output.
        if lname in (
            "ipa",
            "sampa",
            "ahd",
            "enpr",
        ):
            # Only the first non-empty positional argument is used.
            for ipa in (ht.get(x, "") for x in (1, 2, 3, 4)):
                if ipa:
                    ipa = clean_node(wxr, None, [ipa])
                    if lname == "ipa":
                        audio = Sound(ipa=ipa.strip())
                    elif lname == "sampa":
                        audio = Sound(sampa=ipa.strip())
                    else:  # "ahd" or "enpr"
                        audio = Sound(enpr=ipa.strip())
                    return register_sound(audio)
        if lname in (
            "homophone",
            "homophones",
            "hmp",
        ):
            homophones = list(ht.values())
            audio = Sound()
            for hp in homophones:
                audio.homophones.append(clean_node(wxr, None, [hp]))
            if homophones:
                return register_sound(audio)

        return None

    def post_parse_pronunciation_template_fn(
        name: str,
        ht: TemplateArgs,
        expanded: str,
    ) -> str | None:
        lname = name.lower()
        if lname in PANEL_TEMPLATES:
            return ""
        if lname in ("hyph", "hyphenation"):
            # Add hyphenation template output straight to data
            # English hyphenation rules don't make sense. You don't break up
            # "united" into "ụ-nit-ed", that t definitely belongs at the
            # beginning of the last syllable. """you-night.... ED""". Bah.
            text = clean_node(
                wxr,
                None,
                [expanded],
            )  # clean_node strip()s by default
            m = REMOVE_HYPHENATION_RE.match(text)
            if m:
                text = m.group(1)
            if not text:
                # Nothing usable; keep the normal expanded value.
                return None
            if target_data.hyphenation:
                target_data.hyphenation += "; " + text
            else:
                target_data.hyphenation = text
            return ""
        return None

    # Using the already parsed parse-tree would be ideal, but because wikitext
    # is written by humans, sometimes that parse-tree does not actually reflect
    # what it *is*. A pronunciation section for Simple Wiktionary is relatively
    # simple and usually pretty similar, but if we suddenly were introduced to
    # something like a template that generates more entries that are appended to
    # the source text as bullet points, that isn't reflected in the parse tree
    # which has unexpanded templates. The parse tree also doesn't understand
    # what a line is; newlines are just parts of strings, or something that
    # would be created by a parse node implicitly. We know that a line is
    # meaningful in this context: if we use clean_node to revert the parse tree
    # and expand template nodes found in it so that we have a 'clean' raw-text
    # representation with a few remnants of wikitext syntax (like headers and
    # bullet point syntax), then we can handle each line separately and also
    # keep an eye out on the list hierarchy/depth at the same time. The expanded
    # templates can also be handled in template_fn and post_template_fn,
    # even going so far as to leave magic markers in the text that are easily
    # regexable later.
    pron_main = "".join(
        clean_node(
            wxr,
            None,
            child,
            template_fn=parse_pronunciation_template_fn,
            post_template_fn=post_parse_pronunciation_template_fn,
            no_strip=True,
        )
        for child in node.invert_find_child(LEVEL_KIND_FLAGS)
    )

    # We parse the already expanded and cleaned text; templates have been either
    # expanded or they've been replaced with something in the _template_fn
    # handler functions. We're left with a "bare-bones" parse tree that mainly
    # has list structure.
    # This is future-proofing, but it's an issue in other extractors: if a
    # template is used to generate the pronunciation section list, it has
    # been expanded here and properly parsed.
    pron_root = wxr.wtp.parse(pron_main)

    recursively_complete_sound_data(wxr, pron_root, sound_templates)

    # Remove duplicate tags. dict.fromkeys() keeps the first occurrence of
    # each value in its original position, so the resulting lists come out
    # in the same order on every run (list(set(...)) gives no such guarantee
    # for strings).
    for st in sound_templates:
        legit_tags, raw_tags, poses = convert_tags(st.raw_tags)
        if legit_tags:
            st.tags = list(dict.fromkeys(legit_tags))
            # NOTE(review): this copy of the source has lost its indentation,
            # so it cannot be seen here whether the raw_tags assignment
            # belongs inside this `if` or directly in the loop body - confirm
            # against the original file.
            st.raw_tags = list(dict.fromkeys(raw_tags))
        # Build a new list rather than extending st.poses in place, in case
        # that list object is also referenced from elsewhere.
        st.poses = list(dict.fromkeys([*st.poses, *poses]))

    if sound_templates:
        # completely replace sound data with new
        target_data.sounds = sound_templates

    return None