Coverage for src / wiktextract / extractor / simple / pronunciation.py: 89%
149 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
2from copy import copy
4from wikitextprocessor import NodeKind, TemplateArgs, WikiNode
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS # , print_tree
7from wiktextract import WiktextractContext
8from wiktextract.page import clean_node
10# from wiktextract.wxr_logging import logger
11from .models import Sound, WordEntry
12from .parse_utils import PANEL_TEMPLATES
13from .tags_utils import convert_tags
# Used with .match() on cleaned hyphenation-template output: matches text that
# starts with the label "hyphenation" (case-insensitive), optionally followed
# by a comma and/or a colon, and captures everything after it in group 1.
REMOVE_HYPHENATION_RE = re.compile(r"(?i)\s*hyphenation\s*,?:?\s*(.+)")
def recurse_list(
    wxr: WiktextractContext,
    node: WikiNode,
    sound_templates: list[Sound],
    poses: list[str],
    raw_tags: list[str],
) -> tuple[list[str] | None, list[str] | None]:
    """Process one LIST node of a pronunciation section.

    A list holding exactly one item hands back whatever `recurse_list_item`
    returns for that item, so a lone "; pos or tags" line can pass its POS
    list and raw tags up to the caller. A list with several items processes
    them in order, letting each item update the POS list and the tags used
    for the items that follow, and then returns `(None, None)`.
    """
    assert node.kind == NodeKind.LIST

    items = node.children
    # Work on a copy so the caller's list is never shared with the items.
    active_tags = list(raw_tags)

    if len(items) == 1:
        ### HACKYHACKHACK ###
        # ; pos or raw tags
        # * pron 1
        # * pron 2
        # The first line is typically used in Simple English Wiktionary
        # for tagging entries "below" it, even though ";" shouldn't
        # be used to make things bold according to wikitext guidelines
        # (creates broken HTML5 and breaks screen-readers). The ";" list
        # is also separate from the "*" list, so they're completely separated
        # in our parse tree; two different LIST objects!
        return recurse_list_item(
            wxr,
            # Guaranteed LIST_ITEM
            items[0],  # type:ignore[arg-type]
            sound_templates,
            poses,
            active_tags,
        )

    for item in items:
        item_poses, item_tags = recurse_list_item(
            wxr,
            # We are pretty much guaranteed a LIST will only have
            # LIST_ITEM children.
            item,  # type:ignore[arg-type]
            sound_templates,
            poses,
            active_tags,
        )
        if item_poses is not None:
            poses = item_poses
        if item_tags:
            # Tags returned by an item replace (not accumulate on) the tags
            # of earlier siblings; they are always layered on `raw_tags`.
            active_tags = raw_tags + item_tags

    return None, None
def recurse_list_item(
    wxr: WiktextractContext,
    node: WikiNode,
    sound_templates: list[Sound],
    poses: list[str],
    raw_tags: list[str],
) -> tuple[list[str] | None, list[str] | None]:
    """Process one LIST_ITEM (or the ROOT node) and recurse into its sublists.

    The item's own text (everything that is not a nested LIST) is cleaned and
    scanned for `__SOUND_<n>__` markers. Each marker indexes into
    `sound_templates`; the referenced Sound receives the POS list and raw
    tags that apply at this level.

    Returns `(poses, raw_tags)` extracted from the item's text when the item
    contains no sound marker, yields at least one POS or tag, and has no
    sublists; the caller can then apply these to its own `poses` and
    `raw_tags` for following siblings. Either element is None when empty.
    In every other case `(None, None)` is returned.
    """
    # We can trust that contents has only stuff from the beginning of this
    # list_item because lists would "consume" the rest.
    assert node.kind in (NodeKind.LIST_ITEM, NodeKind.ROOT)

    contents = list(node.invert_find_child(NodeKind.LIST))

    text = clean_node(wxr, None, contents).strip()
    text_raw_tags: list[str] = []
    text_poses: list[str] = []

    if "__SOUND" not in text:
        # This is text without a pronunciation template like {{ipa}}.
        # Simple Wikt. is pretty consistent with its pron. templates, so
        # we dismiss the possibility that someone put a stray string of
        # IPA here, and treat the text as a description or a reference
        # to a sense.
        # XXX extract raw tags more gracefully
        text_tags, text_raw_tags, text_poses = convert_tags([text])
        text_raw_tags.extend(text_tags)  # let's ignore the normal tags for now

    if (text_poses or text_raw_tags) and len(contents) == len(node.children):
        # No sublists in this node: hand the labels back to the caller,
        # e.g. for a line like "noun 1".
        return text_poses or None, text_raw_tags or None

    line_raw_tags: list[str] = []
    line_poses: list[str] = []
    for prefix, index in re.findall(r"([^_]*)__SOUND_(\d+)__", text):
        # `prefix` is the text preceding this marker; labels found in it
        # stay in effect for the later markers on the same line until
        # another prefix supplies new ones.
        part_tags, new_raw_tags, part_poses = convert_tags([prefix])
        new_raw_tags.extend(part_tags)
        if part_poses:
            line_poses = part_poses
        if new_raw_tags:
            line_raw_tags = new_raw_tags

        sound = sound_templates[int(index)]  # the Sound object

        # These sound data are attached to POS data later; for this, we
        # use the sound.poses field.
        if line_poses or poses:
            # Store a copy: the same `poses` / `line_poses` list would
            # otherwise be shared by several Sound objects, and a later
            # in-place change to one of them would leak into the others.
            sound.poses = list(line_poses or poses)

        sound.raw_tags.extend(raw_tags + line_raw_tags)

    this_level_tags = raw_tags + text_raw_tags

    if text_poses:
        # Copy before extending so the caller's list is left untouched.
        poses = copy(poses)
        poses.extend(text_poses)

    for li in node.find_child(NodeKind.LIST):
        new_poses, new_tags2 = recurse_list(
            wxr, li, sound_templates, poses, this_level_tags
        )
        if new_poses is not None:
            poses = new_poses
        if new_tags2 is not None:
            this_level_tags = raw_tags + new_tags2

    return None, None
def recursively_complete_sound_data(
    wxr: WiktextractContext, node: WikiNode, sound_templates: list[Sound]
) -> None:
    """Fill in the Sound entries by walking every list found under `node`.

    `node` should be a NodeKind.ROOT node. The walk starts with an empty POS
    list and no raw tags.
    """
    recurse_list_item(wxr, node, sound_templates, poses=[], raw_tags=[])
def process_pron(
    wxr: WiktextractContext,
    node: WikiNode,
    target_data: WordEntry,
) -> None:
    """Process a Pronunciation section WikiNode, extracting Sound data entries
    which are inserted into target_data.sounds. target_data is a WordEntry, so
    can be base_data (used to complete other entries) or an individual POS
    entry.

    Hyphenation template output is written to target_data.hyphenation.
    target_data.sounds is only replaced when at least one Sound was found.
    """

    # XXX: figure out a way to collect category here with clean_node so that
    # it can be properly assigned to the right POS WordEntry; currently,
    # clean_node inserts the category stuff straight into whatever data object
    # is its target, which would make target_data (if used for this) basically
    # have every category from every pron-section level. We already use a hack
    # to later assign sound data to the correct POS with the `poses` field in
    # Sound... Currently ignoring categories for sound.

    # We save data in parse_pronunciation_template_fn into this local list,
    # so the template_fn has to be defined inside this larger function so
    # that it has easy access to sound_templates. Can't be defined outside
    # this function either, because we need access to `wxr` from here, and
    # the template_fn signature is already set in wikitextprocessor.
    sound_templates: list[Sound] = []

    def add_sound(sound: Sound) -> str:
        """Store `sound` and return the magic marker that indexes it."""
        sound_templates.append(sound)
        return "__SOUND_" + str(len(sound_templates) - 1) + "__"

    # Template handling functions are function objects that are used to process
    # a parsed template node (a WikiNode object parsed from `{{template|arg}}`
    # that hasn't been expanded yet into text). The function objects are passed
    # into certain functions like clean_node. When the expander comes
    # across a template, before expanding the template it calls whatever was
    # passed to template_fn=. The handler function can return a string, which is
    # inserted into the returned text, or None in case nothing special will be
    # done with the template and it will use the normal expanded value.
    # post_template_fn= is the same, except it happens after the template has
    # already been expanded and receives that text as a parameter. If you want
    # to replace templates based on their names or arguments before expansion,
    # use template_fn=, if you want access to the expanded text without doing
    # it manually, use post_template_fn=
    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> str | None:
        lname = name.lower()
        if lname in PANEL_TEMPLATES:
            return ""
        if lname == "audio":
            filename = ht.get(1) or ""
            desc = ht.get(2) or ""
            desc = clean_node(wxr, None, [desc]).strip()
            audio = Sound(audio=filename.strip())
            if desc:
                audio.raw_tags.append(desc)
            return add_sound(audio)
        if lname == "audio-ipa":
            filename = ht.get(1) or ""
            ipa = ht.get(2) or ""
            ipa = clean_node(wxr, None, [ipa])
            audio = Sound(audio=filename.strip())
            if ipa:
                audio.ipa = ipa
            return add_sound(audio)
        if lname == "ipachar":
            ipa = ht.get(1) or ""
            if ipa:
                ipa = clean_node(wxr, None, [ipa])
                return add_sound(Sound(ipa=ipa.strip()))
            # No argument: fall through; the template is expanded normally.
        # Simple Wiktionary AHD = enPR
        # The IPA templates of Simple Wiktionary are simple enough that we can
        # just pull the data from the arguments and clean them up and use as
        # is; in contrast with en.wiktionary's templates, where you can have
        # processed qualifiers everywhere, it becomes necessary to do all of
        # this in post_template_fn= and parse the expanded output.
        if lname in (
            "ipa",
            "sampa",
            "ahd",
            "enpr",
        ):
            # Only the first non-empty argument among positions 1-4 is used.
            for position in (1, 2, 3, 4):
                arg = ht.get(position, "")
                if not arg:
                    continue
                value = clean_node(wxr, None, [arg]).strip()
                if lname == "ipa":
                    audio = Sound(ipa=value)
                elif lname == "sampa":
                    audio = Sound(sampa=value)
                else:  # "ahd" or "enpr"
                    audio = Sound(enpr=value)
                return add_sound(audio)
        if lname in (
            "homophone",
            "homophones",
            "hmp",
        ):
            homophones = list(ht.values())
            audio = Sound()
            for hp in homophones:
                audio.homophones.append(clean_node(wxr, None, [hp]))
            if homophones:
                return add_sound(audio)

        return None

    def post_parse_pronunciation_template_fn(
        name: str,
        ht: TemplateArgs,
        expanded: str,
    ) -> str | None:
        lname = name.lower()
        if lname in PANEL_TEMPLATES:
            return ""
        if lname in ("hyph", "hyphenation"):
            # Add hyphenation template output straight to data
            # English hyphenation rules don't make sense. You don't break up
            # "united" into "ụ-nit-ed", that t definitely belongs at the
            # beginning of the last syllable. """you-night.... ED""". Bah.
            text = clean_node(
                wxr,
                None,
                [expanded],
            )  # clean_node strip()s by default
            m = REMOVE_HYPHENATION_RE.match(text)
            if m:
                text = m.group(1)
            if not text:
                # Nothing usable: keep the normal expanded value.
                return None
            if target_data.hyphenation:
                target_data.hyphenation += "; " + text
            else:
                target_data.hyphenation = text
            return ""
        return None

    # Using the already parsed parse-tree would be ideal, but because wikitext
    # is written by humans, sometimes that parse-tree does not actually reflect
    # what it *is*. A pronunciation section for Simple Wiktionary is relatively
    # simple and usually pretty similar, but if we suddenly were introduced to
    # something like a template that generates more entries that are appended to
    # the source text as bullet points, that isn't reflected in the parse tree
    # which has unexpanded templates. The parse tree also doesn't understand
    # what a line is; newlines are just parts of strings, or something that
    # would be created by a parse node implicitly. We know that a line is
    # meaningful in this context: if we use clean_node to revert the parse tree
    # and expand template nodes found in it so that we have a 'clean' raw-text
    # representation with a few remnants of wikitext syntax (like headers and
    # bullet point syntax), then we can handle each line separately and also
    # keep an eye out on the list hierarchy/depth at the same time. The expanded
    # templates can also be handled in template_fn and post_template_fn,
    # even going so far as to leave magic markers in the text that are easily
    # regexable later.
    parts: list[str] = [
        clean_node(
            wxr,
            None,
            child,
            template_fn=parse_pronunciation_template_fn,
            post_template_fn=post_parse_pronunciation_template_fn,
            no_strip=True,
        )
        for child in node.invert_find_child(LEVEL_KIND_FLAGS)
    ]
    pron_main = "".join(parts)

    # logger.debug(f"{wxr.wtp.title}\n{pron_main}")

    # We parse the already expanded and cleaned text; templates have been either
    # expanded or they've been replaced with something in the _template_fn
    # handler functions. We're left with a "bare-bones" parse tree that mainly
    # has list structure.
    # This is future-proofing, but it's an issue in other extractors: if a
    # template is used to generate the pronunciation section list, it has
    # been expanded here and properly parsed.
    pron_root = wxr.wtp.parse(pron_main)
    # logger.debug(print_tree(pron_root, indent=2, ret_value=True))

    recursively_complete_sound_data(wxr, pron_root, sound_templates)

    # Sort the collected raw tags into proper tags, leftover raw tags and POS
    # labels, removing duplicates along the way.
    for st in sound_templates:
        legit_tags, raw_tags, poses = convert_tags(st.raw_tags)
        if legit_tags:
            st.tags = sorted(set(legit_tags))
        st.raw_tags = sorted(set(raw_tags))
        # Build a new list instead of extending st.poses in place: that list
        # may be shared with other Sound objects.
        st.poses = sorted({*st.poses, *poses})

    if sound_templates:
        # completely replace sound data with new
        target_data.sounds = sound_templates

    return None