Coverage for src/wiktextract/extractor/simple/pronunciation.py: 89%
149 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-24 07:36 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-24 07:36 +0000
1import re
2from copy import copy
4from wikitextprocessor import NodeKind, TemplateArgs, WikiNode
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS # , print_tree
7from wiktextract import WiktextractContext
8from wiktextract.page import clean_node
10# from wiktextract.wxr_logging import logger
11from .models import Sound, WordEntry
12from .parse_utils import PANEL_TEMPLATES
13from .tags_utils import convert_tags
# Matches a leading "hyphenation" label (case-insensitive, optionally
# surrounded by whitespace and followed by an optional comma and/or colon);
# group 1 captures the remaining text up to the end of that line.
REMOVE_HYPHENATION_RE = re.compile(r"(?i)\s*hyphenation\s*,?:?\s*(.+)")
def recurse_list(
    wxr: WiktextractContext,
    node: WikiNode,
    sound_templates: list[Sound],
    poses: list[str],
    raw_tags: list[str],
) -> tuple[list[str] | None, list[str] | None]:
    """Walk a LIST node and hand each of its children to recurse_list_item.

    A list with exactly one child returns whatever recurse_list_item returns
    for that child, so that the caller can apply the returned POS / raw tags
    to the lists that follow. For any other list, the children are processed
    in order, each one seeing the POS and raw tags returned by the children
    before it, and `(None, None)` is returned.
    """
    assert node.kind == NodeKind.LIST

    # Work on a copy so the caller's list is never shared with the children.
    current_tags = list(raw_tags)
    items = node.children

    if len(items) == 1:
        ### HACKYHACKHACK ###
        # ; pos or raw tags
        # * pron 1
        # * pron 2
        # Simple English Wiktionary typically uses the first line to tag the
        # entries "below" it, even though wikitext guidelines say ";" should
        # not be used for bolding (it creates broken HTML5 and breaks
        # screen-readers). The ";" list is separate from the "*" list, so
        # they end up as two different LIST objects in our parse tree.
        return recurse_list_item(
            wxr,
            # Guaranteed LIST_ITEM
            items[0],  # type:ignore[arg-type]
            sound_templates,
            poses,
            current_tags,
        )

    for item in items:
        found_poses, found_tags = recurse_list_item(
            wxr,
            # A LIST is pretty much guaranteed to only have LIST_ITEM
            # children.
            item,  # type:ignore[arg-type]
            sound_templates,
            poses,
            current_tags,
        )
        if found_poses is not None:
            poses = found_poses
        if found_tags:
            current_tags = raw_tags + found_tags

    return None, None
def recurse_list_item(
    wxr: WiktextractContext,
    node: WikiNode,
    sound_templates: list[Sound],
    poses: list[str],
    raw_tags: list[str],
) -> tuple[list[str] | None, list[str] | None]:
    """Process one LIST_ITEM (or ROOT) node and recurse into its sublists.

    Sounds referenced by `__SOUND_n__` markers in the node's own text get
    their `poses` and `raw_tags` filled in. When the node is a pure label
    line (no sound marker, no sublists) that yields POS or raw tags, a tuple
    of those POS strings and raw tags is returned so the caller can apply
    them to `poses` and `raw_tags` on the same level; otherwise the result
    is `(None, None)`.
    """
    assert node.kind in (NodeKind.LIST_ITEM, NodeKind.ROOT)

    # Only the non-list children are needed here: they hold the text from the
    # beginning of this list_item, because lists "consume" the rest.
    own_children = list(node.invert_find_child(NodeKind.LIST))
    text = clean_node(wxr, None, own_children).strip()

    text_raw_tags: list[str] = []
    text_poses: list[str] = []

    if "__SOUND" not in text:
        # Text without a pronunciation template like {{ipa}}. Simple Wikt. is
        # pretty consistent with its pron. templates, so we dismiss the
        # possibility of a stray string of IPA and treat the text as a
        # description or a reference to a sense.
        # XXX extract raw tags more gracefully
        text_tags, text_raw_tags, text_poses = convert_tags([text])
        # Let's ignore the normal tags for now and keep them as raw tags.
        text_raw_tags.extend(text_tags)

        has_sublists = len(own_children) != len(node.children)
        if (text_poses or text_raw_tags) and not has_sublists:
            # Pure label line, e.g. "noun 1"
            return text_poses or None, text_raw_tags or None

    # Tags and POS found before a marker stay in effect for the markers that
    # follow on the same line until replaced by a later non-empty result.
    line_raw_tags: list[str] = []
    line_poses: list[str] = []
    for prefix, index in re.findall(r"([^_]*)__SOUND_(\d+)__", text):
        part_tags, part_raw_tags, part_poses = convert_tags([prefix])
        part_raw_tags.extend(part_tags)
        if part_poses:
            line_poses = part_poses
        if part_raw_tags:
            line_raw_tags = part_raw_tags

        # The number inside the marker is the index of the Sound object.
        sound = sound_templates[int(index)]

        # Sound data is attached to POS data later; sound.poses is what
        # that step uses.
        effective_poses = line_poses or poses
        if effective_poses:
            sound.poses = effective_poses

        sound.raw_tags.extend(raw_tags + line_raw_tags)

    sublist_tags = raw_tags + text_raw_tags

    if text_poses:
        # Copy before extending so the caller's list is left untouched.
        poses = copy(poses)
        poses.extend(text_poses)

    for sublist in node.find_child(NodeKind.LIST):
        returned_poses, returned_tags = recurse_list(
            wxr, sublist, sound_templates, poses, sublist_tags
        )
        if returned_poses is not None:
            poses = returned_poses
        if returned_tags is not None:
            sublist_tags = raw_tags + returned_tags

    return None, None
def recursively_complete_sound_data(
    wxr: WiktextractContext, node: WikiNode, sound_templates: list[Sound]
) -> None:
    """Fill in the entries of `sound_templates` from every list under `node`.

    `node` should be the NodeKind.ROOT node of the re-parsed pronunciation
    text; the walk starts with no inherited POS and no inherited raw tags.
    """
    recurse_list_item(wxr, node, sound_templates, poses=[], raw_tags=[])
def process_pron(
    wxr: WiktextractContext,
    node: WikiNode,
    target_data: WordEntry,
) -> None:
    """Process a Pronunciation section WikiNode, extracting Sound data entries
    which are inserted into target_data.sounds. target_data is a WordEntry, so
    can be base_data (used to complete other entries) or an individual POS
    entry.

    Text produced by hyphenation templates is stored in
    target_data.hyphenation. target_data.sounds is only replaced when at
    least one sound was found."""

    # XXX: figure out a way to collect category here with clean_node so that
    # it can be properly assigned to the right POS WordEntry; currently,
    # clean_node insert the category stuff straight into whatever data object
    # is its target, which would make target_data (if used for this) basically
    # have every category from every pron-section level. We already use a hack
    # to later assign sound data to the correct POS with the `poses` field in
    # Sound... Currently ignoring categories for sound.

    # We save data in parse_pronunciation_template_fn into this local list,
    # so the template_fn has to be defined inside this larger function so
    # that it has easy access to sound_templates. Can't be defined outside
    # this function either, because we need access to `wxr` from here, and
    # the template_fn signature is already set in wikitextprocessor.
    sound_templates: list[Sound] = []

    def add_sound(sound: Sound) -> str:
        """Store `sound` and return the magic marker holding its index."""
        sound_templates.append(sound)
        return f"__SOUND_{len(sound_templates) - 1}__"

    # Template handling functions are function objects that are used to process
    # a parsed template node (a WikiNode object parsed from `{{template|arg}}`
    # that hasn't been expanded yet into text). The function objects are passed
    # into certain functions like clean_node. When the expander comes
    # across a template, before expanding the template it calls whatever was
    # passed to template_fn=. The handler function can return a string, which is
    # inserted into the returned text, or None in case nothing special will be
    # done with the template and it will use the normal expanded value.
    # post_template_fn= is the same, except it happens after the template has
    # already been expanded and receives that text as a parameter. If you want
    # to replace templates based on their names or arguments before expansion,
    # use template_fn=, if you want access to the expanded text without doing
    # it manually, use post_template_fn=
    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> str | None:
        """Replace known pronunciation templates with `__SOUND_n__` markers.

        Returns "" for panel templates, a marker for a recognised template
        with usable arguments, and None to let the template expand normally.
        """
        lname = name.lower()
        if lname in PANEL_TEMPLATES:
            return ""
        if lname == "audio":
            filename = ht.get(1) or ""
            desc = clean_node(wxr, None, [ht.get(2) or ""]).strip()
            sound = Sound(audio=filename.strip())
            if desc:
                sound.raw_tags.append(desc)
            return add_sound(sound)
        if lname == "audio-ipa":
            filename = ht.get(1) or ""
            ipa = clean_node(wxr, None, [ht.get(2) or ""])
            sound = Sound(audio=filename.strip())
            if ipa:
                sound.ipa = ipa
            return add_sound(sound)
        if lname == "ipachar":
            ipa = ht.get(1) or ""
            if not ipa:
                # Nothing to extract; use the normal expansion.
                return None
            ipa = clean_node(wxr, None, [ipa])
            return add_sound(Sound(ipa=ipa.strip()))
        # Simple Wiktionary AHD = enPR
        # The IPA templates of Simple Wiktionary are simple enough that we can
        # just pull the data from the arguments and clean them up and use as
        # is; in contrast with en.wiktionary's templates, where you can have
        # processed qualifiers everywhere, it becomes necessary to do all of
        # this in post_template_fn= and parse the expanded output.
        if lname in ("ipa", "sampa", "ahd", "enpr"):
            # Only the first non-empty argument among 1..4 is used.
            for ipa in (ht.get(x, "") for x in (1, 2, 3, 4)):
                if not ipa:
                    continue
                ipa = clean_node(wxr, None, [ipa]).strip()
                if lname == "ipa":
                    sound = Sound(ipa=ipa)
                elif lname == "sampa":
                    sound = Sound(sampa=ipa)
                else:  # "ahd" or "enpr"
                    sound = Sound(enpr=ipa)
                return add_sound(sound)
            return None
        if lname in ("homophone", "homophones", "hmp"):
            homophones = list(ht.values())
            sound = Sound()
            for hp in homophones:
                sound.homophones.append(clean_node(wxr, None, [hp]))
            if homophones:
                return add_sound(sound)

        return None

    def post_parse_pronunciation_template_fn(
        name: str,
        ht: TemplateArgs,
        expanded: str,
    ) -> str | None:
        """Move hyphenation template output into target_data.hyphenation.

        Returns "" when the template was consumed (panel templates and
        non-empty hyphenations) and None to keep the expanded text.
        """
        lname = name.lower()
        if lname in PANEL_TEMPLATES:
            return ""
        if lname in ("hyph", "hyphenation"):
            # Add hyphenation template output straight to data
            # English hyphenation rules don't make sense. You don't break up
            # "united" into "ụ-nit-ed", that t definitely belongs at the
            # beginning of the last syllable. """you-night.... ED""". Bah.
            text = clean_node(
                wxr,
                None,
                [expanded],
            )  # clean_node strip()s by default
            m = REMOVE_HYPHENATION_RE.match(text)
            if m:
                text = m.group(1)
            if not text:
                return None
            if target_data.hyphenation:
                target_data.hyphenation += "; " + text
            else:
                target_data.hyphenation = text
            return ""
        return None

    # Using the already parsed parse-tree would be ideal, but because wikitext
    # is written by humans, sometimes that parse-tree does not actually reflect
    # what it *is*. A pronunciation section for Simple Wiktionary is relatively
    # simple and usually pretty similar, but if we suddenly were introduced to
    # something like a template that generates more entries that are appended to
    # the source text as bullet points, that isn't reflected in the parse tree
    # which has unexpanded templates. The parse tree also doesn't understand
    # what a line is; newlines are just parts of strings, or something that
    # would be created by a parse node implicitly. We know that a line is
    # meaningful in this context: if we use clean_node to revert the parse tree
    # and expand template nodes found in it so that we have a 'clean' raw-text
    # representation with a few remnants of wikitext syntax (like headers and
    # bullet point syntax), then we can handle each line separately and also
    # keep an eye out on the list hierarchy/depth at the same time. The expanded
    # templates can also be handled in template_fn and post_template_fn,
    # even going so far as to leave magic markers in the text that are easily
    # regexable later.
    pron_main = "".join(
        clean_node(
            wxr,
            None,
            child,
            template_fn=parse_pronunciation_template_fn,
            post_template_fn=post_parse_pronunciation_template_fn,
            no_strip=True,
        )
        for child in node.invert_find_child(LEVEL_KIND_FLAGS)
    )

    # logger.debug(f"{wxr.wtp.title}\n{pron_main}")

    # We parse the already expanded and cleaned text; templates have been either
    # expanded or they've been replaced with something in the _template_fn
    # handler functions. We're left with a "bare-bones" parse tree that mainly
    # has list structure.
    # This is future-proofing, but it's an issue in other extractors: if a
    # template is used to generate the pronunciation section list, it has
    # been expanded here and properly parsed.
    pron_root = wxr.wtp.parse(pron_main)
    # logger.debug(print_tree(pron_root, indent=2, ret_value=True))

    recursively_complete_sound_data(wxr, pron_root, sound_templates)

    # Convert raw tags and remove duplicate tags
    for st in sound_templates:
        legit_tags, raw_tags, poses = convert_tags(st.raw_tags)
        if legit_tags:
            st.tags = sorted(set(legit_tags))
            st.raw_tags = sorted(set(raw_tags))
        # Build a fresh list instead of extending st.poses in place: the same
        # list object may have been assigned to several Sound entries in
        # recurse_list_item, and mutating it would leak POS between them.
        st.poses = sorted(set(st.poses).union(poses))

    if sound_templates:
        # completely replace sound data with new
        target_data.sounds = sound_templates

    return None