Coverage for src/wiktextract/extractor/simple/pos.py: 80%
195 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from wikitextprocessor import NodeKind, TemplateArgs, TemplateNode, WikiNode
2from wikitextprocessor.parser import LEVEL_KIND_FLAGS
4from wiktextract import WiktextractContext
5from wiktextract.page import clean_node
7from .models import Example, Form, Linkage, Sense, TemplateData, WordEntry
8from .section_titles import POS_HEADINGS
9from .table import parse_pos_table
10from .tags_utils import convert_tags_in_sense
11from .text_utils import (
12 POS_ENDING_NUMBER_RE,
13 POS_TEMPLATE_NAMES,
14 STRIP_PUNCTUATION,
15)
17# from wiktextract.wxr_logging import logger
def remove_duplicate_forms(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Return `forms` without entries that are repeated later in the list.

    Two forms count as duplicates when their `form`, `tags` and `raw_tags`
    all compare equal. Of a group of duplicates only the last occurrence is
    kept, at its original position. When nothing was removed the input list
    object itself is returned; a falsy input yields a new empty list.
    `wxr` is accepted but not used.
    """
    if not forms:
        return []

    def same_form(left: Form, right: Form) -> bool:
        # The three fields that decide whether two forms are identical.
        return (
            left.form == right.form
            and left.tags == right.tags
            and left.raw_tags == right.raw_tags
        )

    # Keep an entry only if no later entry is identical to it.
    unique = [
        candidate
        for pos, candidate in enumerate(forms)
        if not any(same_form(candidate, later) for later in forms[pos + 1 :])
    ]
    # wxr.wtp.debug("Found duplicate forms", sortid="simple/pos/32")
    return unique if len(unique) < len(forms) else forms
# Element type of the lists returned while walking gloss lists: each item is
# either a sense or an example.
ExOrSense = Sense | Example

# Template names whose expansion is replaced with an empty string when gloss
# text is cleaned.
IGNORED_GLOSS_TEMPLATES = ("exstub",)
def _unique_linkages(linkages: list[Linkage]) -> list[Linkage]:
    """Drop linkages whose word was already seen, keeping first-seen order."""
    seen_words = set()
    unique = []
    for linkage in linkages:
        if linkage.word in seen_words:
            continue
        seen_words.add(linkage.word)
        unique.append(linkage)
    return unique


def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Extract tags, linkages and gloss text from what is preferably one line.

    Leading templates whose cleaned text ends with `)` or `]` are treated as
    tags and stored in `parent_sense.raw_tags`; the rest of `contents` is
    cleaned into a gloss string and appended to `parent_sense.glosses`.
    Synonym and antonym templates are collected into `parent_sense.synonyms`
    and `parent_sense.antonyms`.

    Returns True when a non-empty gloss string was added. Returns False when
    `contents` is empty, when the cleaned text is empty, or when an `exstub`
    template is met among the leading templates (in which case "no-gloss" is
    appended to `parent_sense.raw_tags`). The boolean lets recursive callers
    know whether there was any gloss text.
    """
    if not contents:
        return False

    template_tags: list[str] = []
    found_template = False
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []

    # Defined here so that it closes over `synonyms` and `antonyms`; this is
    # the usual way these template callbacks are written in the extractors.
    def gloss_template_fn(name: str, ht: TemplateArgs) -> str | None:
        if name in ("synonyms", "synonym", "syn"):
            target = synonyms
        elif name in ("antonyms", "antonym", "ant"):
            target = antonyms
        elif name in IGNORED_GLOSS_TEMPLATES:
            return ""
        else:
            # Don't handle other templates here; `None` leaves the normal
            # expansion alone.
            return None

        # The parameters of these templates are simply a list of words.
        for arg in ht.values():
            if not arg:
                continue
            target.append(
                Linkage(
                    word=clean_node(
                        wxr, parent_sense, clean_node(wxr, None, arg)
                    )
                )
            )
        # Returning a string replaces the expansion that would otherwise
        # have appeared in the text.
        return ""

    for i, tnode in enumerate(contents):
        if (
            isinstance(tnode, str)
            and tnode.strip(STRIP_PUNCTUATION)
            or not isinstance(tnode, (TemplateNode, str))
        ):
            # The first naked string that isn't just whitespace/punctuation,
            # or the first WikiNode that isn't a template, ends the tags.
            break
        if isinstance(tnode, TemplateNode):
            if tnode.template_name == "exstub":
                parent_sense.raw_tags.append("no-gloss")
                return False
            tag_text = clean_node(
                wxr, parent_sense, tnode, template_fn=gloss_template_fn
            )
            if tag_text.endswith((")", "]")):
                # Simple Wiktionary is pretty good at making tag templates
                # expand to bracketed text.
                tag_text = tag_text.strip(STRIP_PUNCTUATION)
                if tag_text:
                    found_template = True
                    template_tags.append(tag_text)
            else:
                # Looks like normal text, so probably something like
                # {{plural of}}; it belongs to the gloss.
                break
    else:
        # Never broke out: every item was consumed, so the gloss starts past
        # the last one.
        i += 1

    if found_template:
        contents = contents[i:]

    text = clean_node(
        wxr, parent_sense, contents, template_fn=gloss_template_fn
    )

    # A template that ended the tag scan above has been expanded there and
    # again as part of `contents`, so the callback may have recorded the same
    # word twice; store each word only once.
    if synonyms:
        parent_sense.synonyms = _unique_linkages(synonyms)

    if antonyms:
        parent_sense.antonyms = _unique_linkages(antonyms)

    if template_tags:
        parent_sense.raw_tags.extend(template_tags)

    if text:
        parent_sense.glosses.append(text)
        return True

    return False
def recurse_glosses1(
    wxr: WiktextractContext,
    parent_sense: Sense,
    node: WikiNode,
) -> list[ExOrSense]:
    """Walk one LIST or LIST_ITEM node and return the senses/examples in it.

    A LIST node returns the concatenated results of its LIST_ITEM children,
    each processed with its own deep copy of `parent_sense`. A LIST_ITEM
    returns either a single `Example` (when its prefix marks it as an example
    or quotation line), the senses found in its sublists, or `parent_sense`
    itself once a gloss (or a "no-gloss" raw tag) has been recorded on it.
    Any other outcome yields an empty list.
    """
    collected: list[ExOrSense] = []
    has_gloss = False

    if node.kind == NodeKind.LIST:
        for child in node.children:
            if not isinstance(child, str) and child.kind == NodeKind.LIST_ITEM:
                collected.extend(
                    recurse_glosses1(
                        wxr, parent_sense.model_copy(deep=True), child
                    )
                )
                continue
            # This should never happen
            wxr.wtp.error(
                f"{child=} is direct child of NodeKind.LIST",
                sortid="simple/pos/44",
            )
    elif node.kind == NodeKind.LIST_ITEM:
        # The item's own contents end where its first sublist begins.
        children = node.children
        split_at = next(
            (
                idx
                for idx, c in enumerate(children)
                if isinstance(c, WikiNode) and c.kind == NodeKind.LIST
            ),
            len(children),
        )
        contents = children[:split_at]
        sublists = children[split_at:]

        # `sarg` is the line prefix (like `#` or `##:`) that encodes the
        # depth and structure of the list.
        if node.sarg in (":", "*"):
            # Lines that are usually notes or random stuff not inside gloss
            # lists; see "dare".
            wxr.wtp.debug(
                f"Gloss item line starts with {node.sarg=}.",
                sortid="simple/pos/214",
            )
            return []
        if node.sarg.endswith((":", "*")):
            # A quotation or example line. Subglosses of examples are not
            # handled. clean_node strip()s already.
            return [Example(text=clean_node(wxr, parent_sense, contents))]

        has_gloss = parse_gloss(wxr, parent_sense, contents)

        for sl in sublists:
            if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST):
                # Should not happen
                wxr.wtp.error(
                    f"Sublist is not NodeKind.LIST: {sublists=!r}",
                    sortid="simple/pos/82",
                )
                continue
            below = recurse_glosses1(
                wxr, parent_sense.model_copy(deep=True), sl
            )
            for found in below:
                if isinstance(found, Example):
                    parent_sense.examples.append(found)
                else:
                    collected.append(found)

    if collected:
        # Actual senses came back from below, so everything else (including
        # example data given to parent_sense) is ignored in favor of them.
        # XXX if this becomes relevant, add the example data to a returned
        # subsense instead?
        return collected

    if has_gloss or "no-gloss" in parent_sense.raw_tags:
        return [parent_sense]

    return []
def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Recurse through WikiNodes to find glosses and sense-related data.

    Runs `recurse_glosses1` on `node` with a fresh, empty `Sense` as the
    base, converts the tags of every returned sense with
    `convert_tags_in_sense`, and returns those senses (possibly an empty
    list). An `Example` that reaches this level is reported as an error and
    dropped. `data` is accepted but not used here.
    """
    senses: list[Sense] = []

    for item in recurse_glosses1(wxr, Sense(), node):
        if isinstance(item, Example):
            # Examples are supposed to be attached to a sense further down.
            wxr.wtp.error(
                f"Example() has bubbled to recurse_glosses: "
                f"{item.model_dump_json()}",
                sortid="simple/pos/glosses",
            )
            continue
        convert_tags_in_sense(item)
        senses.append(item)

    return senses
def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    # the "noun" in "Noun 2"
    pos_title: str,
    # the "2" in "Noun 2"
    pos_num: int = -1,
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'.

    `data` provides basic data common with other POS sections, like
    pronunciation or etymology; it is updated in place (pos, pos_num, sounds,
    forms, tags, head_templates, senses) and returned. `pos_title` must be a
    key of `POS_HEADINGS`. At least one sense is always present on return: a
    sense tagged "no-gloss" is added when nothing else was found.
    """

    # Metadata for different part-of-speech kinds.
    pos_meta = POS_HEADINGS[pos_title]
    data.pos = pos_meta["pos"]  # the internal/translated name for the POS
    data.pos_num = pos_num  # SEW uses "Noun 1", "Noun 2" style headings.

    # Sound data associated with this POS might be coming from a shared
    # section, in which case we've tried to tag the sound data with its
    # pos name + number if possible. Filter out stuff that doesn't fit.
    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos section(s),
            # so we add it to everything; this is the default behavior.
            new_sounds.append(sound)
            continue
        for sound_pos in sound.poses:
            m = POS_ENDING_NUMBER_RE.search(sound_pos)
            if m is not None:
                s_num = int(m.group(1).strip())
                s_pos = sound_pos[: m.start()].strip().lower()
            else:
                s_pos = sound_pos.strip().lower()
                s_num = -1
            sound_meta = POS_HEADINGS[s_pos]
            if sound_meta["pos"] == data.pos and s_num == data.pos_num:
                new_sounds.append(sound)
                # One matching label is enough; stopping here keeps the
                # sound from being added once per matching label.
                break
    data.sounds = new_sounds

    # Get child nodes except headings (= LEVEL).
    pos_contents = list(node.invert_find_child(LEVEL_KIND_FLAGS))

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # Check POS templates at the start of the section (Simple English
    # specific).
    template_tags: list[str] = []
    template_forms: list[Form] = []
    head_templates: list[TemplateData] = []

    # Typically, a Wiktionary has a word head before glosses, which contains
    # the main form of the word (usually same as the title of the article)
    # and common other forms of the word, plus qualifiers and other data
    # like that; however, for Simple English Wiktionary the format is to
    # have a table (or two, if there's variations) containing the word's
    # conjugation or declension, so we don't have to actually parse the
    # head here.
    #
    # `gloss_start` is the index of the first element that is not part of
    # the head. It defaults to "past the end" so that when every element is
    # whitespace or a POS template, none of them is treated as gloss content.
    gloss_start = len(pos_contents)
    for i, child in enumerate(pos_contents):
        if isinstance(child, str) and not child.strip():
            # Ignore whitespace
            continue
        # TemplateNode is a subclass of WikiNode; not all kinds of nodes have
        # a subclass, but TemplateNode is handy.
        if not (
            isinstance(child, TemplateNode)
            and child.template_name in POS_TEMPLATE_NAMES
        ):
            gloss_start = i
            break

        if child.template_name not in pos_meta["templates"]:
            wxr.wtp.debug(
                f"Template {child.template_name} "
                f"found under {pos_title}",
                sortid="simple/pos/93",
            )
        elif ttags := pos_meta["templates"][child.template_name]:
            # Some templates have associated tags:
            # `irrnoun` -> ["irregular"]
            template_tags.extend(ttags)
        if forms := parse_pos_table(wxr, child, data):
            template_forms.extend(forms)
        else:
            wxr.wtp.warning(
                f"POS template '{child.template_name}' did "
                "not have any forms.",
                sortid="simple/pos/129",
            )
        head_templates.append(
            TemplateData(
                name=child.template_name,
                args={
                    str(k): clean_node(wxr, None, v)
                    for k, v in child.template_parameters.items()
                },
                # clean_node returns an empty string for a table, so a
                # placeholder is stored instead of the real expansion.
                expansion="[POS TABLE]",
            )
        )

    # De-duplicate while keeping first-seen order, so the resulting tag order
    # is the same on every run.
    template_tags = list(dict.fromkeys(template_tags))
    data.forms.extend(template_forms)
    data.forms = remove_duplicate_forms(wxr, data.forms)
    data.tags.extend(template_tags)
    data.head_templates.extend(head_templates)

    gloss_contents = pos_contents[gloss_start:]

    found_list = False
    got_senses = False
    for child in gloss_contents:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses, but with
        # Simple English Wiktionary that seems rare.
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            senses = recurse_glosses(wxr, child, data)
            found_list = True
            if len(senses) > 0:
                got_senses = True
                data.senses.extend(senses)

    if not got_senses and found_list:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    # If there is no list, clump everything into one gloss.
    if not found_list:
        sense = Sense()
        found_gloss = parse_gloss(wxr, sense, gloss_contents)
        if found_gloss is True or len(sense.raw_tags) > 0:
            convert_tags_in_sense(sense)
            if len(sense.glosses) == 0 and "no-gloss" not in sense.tags:
                sense.tags.append("no-gloss")
            data.senses.append(sense)

    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    return data