Coverage for src / wiktextract / extractor / el / pos.py: 81%
468 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
2from collections.abc import Iterator
3from functools import partial
4from typing import Any, TypeAlias
5from unicodedata import name as unicode_name
7from wikitextprocessor import (
8 HTMLNode,
9 NodeKind,
10 TemplateArgs,
11 TemplateNode,
12 WikiNode,
13)
14from wikitextprocessor.parser import LEVEL_KIND_FLAGS
16from wiktextract import WiktextractContext
17from wiktextract.extractor.el.tags import translate_raw_tags
18from wiktextract.page import clean_node
20from .head import parse_head
21from .linkages import process_linkage_section
22from .models import (
23 AltForm,
24 Example,
25 FormSource,
26 Linkage,
27 Sense,
28 TemplateData,
29 WordEntry,
30)
31from .parse_utils import (
32 GREEK_LANGCODES,
33 expand_suffix_forms,
34 parse_lower_heading,
35 remove_duplicate_forms,
36)
37from .section_titles import POS_HEADINGS, Heading, POSName
38from .table import parse_table, process_inflection_section, remove_article_forms
39from .tags_utils import convert_tags_in_sense
40from .text_utils import (
41 ENDING_NUMBER_RE,
42 normalized_int,
43)
44from .translations import process_translations
46# from wiktextract.wxr_logging import logger
def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    prev_data: WordEntry | None,  # data from the last entry in this language
    pos: POSName,  # the "noun" in "Noun 2"
    title: str,
    pos_tags: list[str],
    pos_num: int = -1,  # the "2" in "Noun 2"
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'. `data` provides basic
    data common with other POS sections, like pronunciation or etymology.

    Mutates `data` in place (pos, tags, sounds, forms, senses, sub-section
    linkages/translations) and returns it.
    """

    # Metadata for different part-of-speech kinds.
    # print(f"{pos_title=}, {pos_tags=}, {pos_num=}")
    data.pos = pos  # the internal/translated name for the POS
    data.pos_num = pos_num  # SEW uses "Noun 1", "Noun 2" style headings.
    for pos_tag in pos_tags:
        if pos_tag not in data.tags:
            data.tags.append(pos_tag)

    wxr.wtp.start_subsection(title)

    # Sound data associated with this POS might be coming from a shared
    # section, in which case we've tried to tag the sound data with its
    # pos name + number if possible. Filter out stuff that doesn't fit.
    # This is actually pretty common, but if the edition has proper hierarchies
    # for this, doing this step might be unnecessary.
    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos section(s), so
            # we add it to everything; this is basically the default behavior.
            new_sounds.append(sound)
        else:
            for sound_pos in sound.poses:
                # A pose string may look like "noun 2"; split off the trailing
                # number (if any) and normalize the name for POS_HEADINGS.
                m = ENDING_NUMBER_RE.search(sound_pos)
                if m is not None:
                    s_num = normalized_int(m.group(1).strip())
                    s_pos = sound_pos[: m.start()].strip().lower()
                else:
                    s_pos = sound_pos.strip().lower()
                    s_num = -1
                sound_meta = POS_HEADINGS[s_pos]
                s_pos = sound_meta["pos"]
                if s_pos == data.pos and s_num == data.pos_num:
                    new_sounds.append(sound)
    data.sounds = new_sounds

    # Get child nodes *except* headings (= LEVEL).
    pos_contents = list(
        node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
        # include empty string only for debug printing?
    )

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # split_nodes_to_lines returns lists items on their own 'line'
    node_lines = list(split_nodes_to_lines(pos_contents))

    glosses_index = None
    glosses_lists = []
    for i, line in enumerate(node_lines):
        # Looking at the "rump" after glosses lists starts, it's simplest
        # just to pull all the list nodes, and handle them. Anything after
        # or inbetween (like categories, extra templates, tables and images)
        # can be ignored.
        if (
            len(line) == 1
            and isinstance(line[0], WikiNode)
            and line[0].kind == NodeKind.LIST
            and (line[0].sarg != ":")
        ):
            if glosses_index is None:
                glosses_index = i
            glosses_lists.append(line[0])

    if glosses_index is None:
        # if nothing found, accept ":" nodes
        for i, line in enumerate(node_lines):
            if (
                len(line) == 1
                and isinstance(line[0], WikiNode)
                and line[0].kind == NodeKind.LIST
            ):
                if glosses_index is None:
                    glosses_index = i
                glosses_lists.append(line[0])

    if glosses_index is None:
        # Could not find any glosses.
        # logger.info(f" //// {wxr.wtp.title}\n MISSING GLOSSES")
        wxr.wtp.wiki_notice("Missing glosses", sortid="pos/20250121")
        data.tags.append("no-gloss")

    template_data: list[TemplateData] = []
    category_data: list[str] = []
    # NOTE(review): category_data is filled by the handler below but never
    # read again inside this function — confirm whether it should be merged
    # into data.categories or can be removed.
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in bold_node_handler
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not templates inside templates.
    template_depth = 0
    top_template_name: str | None = None

    def bold_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | str | None:
        """Insert special markers `__*S__` and `__*E__` around bold nodes so
        that the strings can later be split into "head-word" and "tag-words"
        parts. Collect incidental stuff, like side-tables, that are often
        put around the head."""
        assert isinstance(node, WikiNode)
        kind = node.kind
        nonlocal template_depth
        nonlocal top_template_name
        if kind == NodeKind.BOLD or (
            isinstance(node, HTMLNode)
            and (
                node.tag == "span"
                and "style" in node.attrs
                and (
                    "bold" in node.attrs["style"]
                    # Special handling for output for stuff in arabic script
                    or node.attrs["style"] == "color:black; font-size:200%;"
                )
                or node.tag == "b"
                or node.tag == "strong"
            )
        ):
            # These are word forms almost always
            return ["__B__", *node.children, "__/B__"]
        elif kind == NodeKind.ITALIC or (
            isinstance(node, HTMLNode)
            and (
                (
                    node.tag == "span"
                    and "style" in node.attrs
                    and "italic" in node.attrs["style"]
                )
                or node.tag == "i"
                or node.tag == "em"
            )
        ):
            # These are almost always tag words; often 'kai' isn't italicized,
            # for example.
            return ["__I__", *node.children, "__/I__"]
        elif isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # the templates are handled with bold_node_handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                template_data.append(
                    TemplateData(
                        name=node.template_name,
                        args={
                            str(k): clean_node(wxr, None, v)
                            for k, v in node.template_parameters.items()
                        },
                        expansion=expanded,
                    )
                )
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=bold_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret
        elif kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                return None
            if node.largs[0][0].startswith("Κατηγορία:"):
                category_data.append(node.largs[0][0][len("Κατηγορία:") :])
                return [""]
            # Special case for meta-links like Πρότυπο:ετ that generate
            # both a category link and :category link that is actually
            # displayed as a link, but for our purposes we want to ignore
            # that it is a link; it's a tag.
            if node.largs[0][0].startswith(":Κατηγορία:"):
                # unpacking a list-comprehension, unpacking into a list
                # seems to be more performant than adding lists together.
                return [
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                        node_handler_fn=bold_node_handler_fn,
                    )
                    # output the "visible" half of the link.
                ]
            if node.largs[0][0].startswith("Αρχείο:"):
                # Ignore images and files.
                return [""]
            # Often forms are 'formatted' with links, so let's mark these
            # too.
            return [
                "__L__",
                wxr.wtp.node_to_text(
                    node.largs[1:2] or node.largs[0],
                    node_handler_fn=bold_node_handler_fn,
                ),
                # output the "visible" half of the link.
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]
            # print(f"{node.largs=}")

        elif kind in {
            NodeKind.TABLE,
        }:
            # XXX Handle tables here
            # template depth and top-level template name
            nonlocal table_nodes
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # Get Head Line
    # Head *should* be immediately before the glosses...
    # print(node_lines[:glosses_index])
    found_head = False

    for line in reversed(node_lines[:glosses_index]):
        # NOTE(review): template_data is reset on every iteration and there
        # is no `break` after a head is found, so the template_data used for
        # head_templates below comes from the last-processed (i.e. earliest)
        # non-empty line, not necessarily the head line — confirm intended.
        template_data = []
        template_depth = 0
        stripped = (
            wxr.wtp.node_to_text(line, node_handler_fn=bold_node_handler_fn)
            .removeprefix(":")
            .strip()
        )
        if not stripped:
            continue
        if not found_head and (parsed_forms := parse_head(wxr, stripped)):
            for form in parsed_forms:
                translate_raw_tags(form)

            if (
                data.lang_code == "el"
                # If there are spaces around the "/", we don't parse the
                # header correctly, so just skip the expansion.
                # Ex. "πρωτοπόρος, -α / -ος, -ο"
                # Remove this check if that ever gets fixed.
                and len(parsed_forms) == 3
                # Only adjectives or participles
                and (
                    data.pos == "adj"
                    or (data.pos == "verb" and "participle" in data.tags)
                )
            ):
                parsed_forms = expand_suffix_forms(parsed_forms)

            parsed_forms = remove_article_forms(parsed_forms, data.word)
            data.forms.extend(parsed_forms)
            found_head = True

    if not found_head:
        # There are a bunch of Greek Wiktionary articles with POS sections
        # without heads, but they seem to always follow ones with heads;
        # in this case, the result is just not including any `forms` field
        # for these (or copying the previous one).

        if prev_data is None:
            wxr.wtp.wiki_notice(
                f"Part of speech missing head: {wxr.wtp.title}",
                sortid="pos/460/20250104",
            )
        else:
            # No head found, copy previous (in this language)
            data.forms = [
                form.model_copy(deep=True) for form in prev_data.forms
            ]

    if len(template_data) > 0:
        data.head_templates = template_data
        # logger.info(
        #     f" //// {wxr.wtp.title}\n >>>"
        #     + "\n >>>".join(repr(td) for td in template_data)
        # )

    for template_name, table_node in table_nodes:
        # XXX template_name
        parse_table(
            wxr,
            table_node,
            data,
            data.lang_code in GREEK_LANGCODES,
            template_name=template_name or "",
            source="inflection",
        )

    data.forms = remove_duplicate_forms(wxr, data.forms)

    # Ignore images and files
    # 2025-01-17 13:10:10,035 INFO: //// Ηρόδοτος
    # //// [[Αρχείο:Herodotus Massimo Inv124478.jpg|200px|thumb|[[προτομή]] του Ηροδότου]]

    # Have to ignore {{(( specifically. Creates columns.
    # 2025-01-17 13:10:11,059 INFO: //// κάνω
    # //// {{((|width=97%}}

    # see: free -> {{en-verb-'free'}} creates a floating inflection table
    # followed by the usual head template
    # >>>>>>>>> free
    # >>>{{en-adj-r}}  # floating table
    # >>>{{τ|en|{{PAGENAME}}}}, ''συγκριτικός'' '''freer''', ''υπερθετικός'' '''freest'''
    # pretty consistent bolding and italics

    # genus
    # {{τ|la|{{PAGENAME}}}} {{ο}} ([[γενική]]: generis) (3ης [[κλίση]]ς)

    # καφέ
    # >>>[[Αρχείο:Cafe Museum - Vienna (5402363918).jpg|μικρογραφία|τραπέζι σε βιενέζικο '''καφέ''']]
    # >>>'''{{PAGENAME}}''' {{ο}} {{ακλ|ουδ}}
    # Ignore images

    # κρόκος
    # >>>{| align="right"
    # >>>
    # >>>|-
    # >>>
    # >>>|[[Αρχείο:Crocus sativus1.jpg|thumb|150px|Άνθη '''κρόκου''' (''Crocus sativus'').]]
    # >>>
    # >>>
    # >>>|[[Αρχείο:Raw egg.jpg|thumb|200px|Ο '''κρόκος''' ενός αβγού.]]
    # >>>
    # >>>
    # >>>|}
    # >>>
    # >>>'''{{PAGENAME}}''' {{α}}

    # p
    # >>>'''{{PAGENAME}}''' ''πεζό'' (''κεφαλαίο:'' '''{{l|P|la}}''')
    # lowercase, uppercase

    # Δημόκριτος
    # >>>'''{{PAGENAME}}'''
    # >>># {{όνομα||α}}
    # >>>{{clear}}
    # Clear is just formatting to move the line down where there are empty
    # margins.

    # BIG ASSUMPTION: let us assume that Greek Wiktionary doesn't use templates
    # that generate multiline text that is part of head. That is, we can see
    # each newline because they are in strings, and when something that does
    # generate virtual newlines (list) pops up, that's when the head portion
    # ends.
    # Greek Wiktionary head sections look like this:
    # > Pre-head templates that create side-tables, like inflections
    # > Possible formatting templates like {{clear}} that should be ignored
    # > Head template last before glosses list
    # > Clear again...
    # > Glosses list tree, where we can stop.
    # We can create "lines" of these by looping over the items in pos_content
    # and looking for newlines in strings, because that's where they mainly
    # should be (except side-table templates). We handle earlier lines
    # differently than the last line before the glosses list, which is the
    # head.

    # ======================

    ### Glosses after head ###
    got_senses = False
    for lst in glosses_lists:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses, but with
        # Simple English Wiktionary that seems rare.
        # logger.debug(f"{lst}")
        senses = recurse_glosses(wxr, lst, data)
        if len(senses) > 0:
            got_senses = True
            for sense in senses:
                translate_raw_tags(sense)
            data.senses.extend(senses)

    if not got_senses and len(glosses_lists) > 0:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    # If there is no list, clump everything into one gloss.
    # if not len(glosses_lists > 0):
    #     sense = Sense()
    #     found_gloss = parse_gloss(wxr, sense, pos_contents[i:])
    #     if found_gloss is True or len(sense.raw_tags) > 0:
    #         convert_tags_in_sense(sense)
    #         if len(sense.glosses) == 0 and "no-gloss" not in sense.tags:
    #             sense.tags.append("no-gloss")
    #         data.senses.append(sense)

    # Guarantee at least one (empty) sense so downstream consumers always
    # have a senses list to work with.
    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    # Handle sub-sections (translations, inflections, linkages) nested
    # under this POS heading.
    pos_sublevels = list(
        node.find_child(LEVEL_KIND_FLAGS)
        # include empty string only for debug printing?
    )
    for sl in pos_sublevels:
        subtitle = clean_node(wxr, None, sl.largs).lower().strip()

        heading_type, *_ = parse_lower_heading(wxr, subtitle)

        match heading_type:
            case Heading.Translations:
                process_translations(wxr, data, sl)
            case Heading.Infl:
                # Greek editions use "conjugation" for (Ancient) Greek
                # entries; everything else is generic "inflection".
                source: FormSource = "inflection"
                if data.lang_code in ("el", "grc"):
                    source = "conjugation"
                process_inflection_section(wxr, data, sl, source=source)
            case (
                Heading.Related
                | Heading.Synonyms
                | Heading.Antonyms
                | Heading.Transliterations
                | Heading.AltOf
                | Heading.FormOf
            ):
                process_linkage_section(wxr, data, sl, heading_type)
            # Other heading types are intentionally ignored here.

    return data
# One or more leading "(...)" groups (with surrounding whitespace) at the
# very start of a gloss line; Greek Wiktionary uses such parentheses for
# template-less tags and forms before the gloss text proper.
PARENS_BEFORE_RE = re.compile(r"\s*(\([^()]+\)\s*)+")
# Iterates the contents of each individual "(...)" group within that prefix.
ITER_PARENS_RE = re.compile(r"\(([^()]+)\)")
def bold_node_fn(
    node: WikiNode,
) -> list[str | WikiNode] | None:
    """Wrap italic/bold nodes in pseudo-markup markers.

    Returns the node's children surrounded by `__I__`/`__/I__` or
    `__B__`/`__/B__`, or None to leave the node to default handling.
    """
    # Marker pairs keyed by the node kinds we care about.
    wrappers = {
        NodeKind.ITALIC: ("__I__", "__/I__"),
        NodeKind.BOLD: ("__B__", "__/B__"),
    }
    pair = wrappers.get(node.kind)
    if pair is None:
        return None
    opener, closer = pair
    return [opener, *node.children, closer]
def extract_alt_form_templates(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
    siblings: list[str | WikiNode],
    siblings_index: int,
) -> None:
    """Parse form_of or alt_of templates.

    Supports:
    1. κλ | generic | form_of
    2. γρ | generic | form_of or alt_of
    3. πτώση/πτώσεις | nouns, adjectives etc. | form_of and tags
    4. υπο/υποκ | nouns | form_of
    5. μεγ/μεγεθ | nouns | form_of
    6. ρημ τύπος | verbs | form_of
    7. μτχ | verbs | form_of

    * References:
    1. https://el.wiktionary.org/wiki/Πρότυπο:κλ
    2. https://el.wiktionary.org/wiki/Module:άλλημορφή
    3. https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_κλιτικούς_τύπους
    4. https://el.wiktionary.org/wiki/Πρότυπο:υπο
    5. https://el.wiktionary.org/wiki/Πρότυπο:μεγ
    6. https://el.wiktionary.org/wiki/Πρότυπο:ρημ_τύπος
    7. https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_μετοχές
    """
    t_name = t_node.template_name
    t_args = t_node.template_parameters

    # Pre-bind the common arguments; only the argument index differs per
    # template family below.
    extract_basic = partial(
        extract_form_of_templates_basic,
        wxr,
        parent_sense,
        siblings,
        siblings_index,
        t_name,
        t_node,
    )

    # Generic inflected-form template.
    if t_name == "κλ":
        return extract_basic(extract_argument=2)

    # Generic spelling template.
    # * Try parsing a "form_of" if the second template argument refers to a
    #   form (μορφ / μορφή / λόγια μορφή του, etc.).
    # * Otherwise, parse an "alt_of".
    #
    # Notes:
    # * All occurrences in wiktionary have at least one argument
    if t_name in ("γρ", "γραφή του", "alter") and 1 in t_args:
        if 2 in t_args and "μορφ" in clean_node(wxr, None, t_args[2]):
            return extract_basic(extract_argument=1)
        # We could add some tags here, but AltForm takes none
        parent_sense.alt_of.append(
            AltForm(word=clean_node(wxr, None, t_args[1]).strip())
        )
        return

    # Nouns and adjectives: case/number templates.
    if ("πτώσεις" in t_name or "πτώση" in t_name) and 1 in t_args:
        return extract_form_of_templates_ptosi(wxr, parent_sense, t_node)

    # Nouns: diminutives and augmentatives.
    # Note that the "diminutive/augmentative" tags will be added later on
    # via translation of the "υποκοριστικό/μεγεθυντικό" raw_tags
    if t_name in ("υπο", "υποκ", "μεγ", "μεγεθ") and 1 in t_args:
        return extract_basic(extract_argument=1)

    # Verbs: verb-form template.
    if t_name == "ρημ τύπος":
        return extract_basic(extract_argument=2)

    # Verbs: participle templates.
    if t_name.startswith("μτχ"):
        return extract_basic(extract_argument=1)
def extract_form_of_templates_basic(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    siblings: list[str | WikiNode],
    sibling_index: int,
    t_name: str,
    t_node: TemplateNode,
    extract_argument: int | str,
) -> None:
    """Extract a form_of lemma from a single template argument, or — when
    the template has no such argument — from link nodes that immediately
    follow the template on the same line."""
    t_args = t_node.template_parameters
    if extract_argument in t_args:
        lemma = clean_node(wxr, None, t_args[extract_argument]).strip()
    else:
        # mtxpp template has no args, consume the next links for the
        # form_of field
        # cf. https://github.com/tatuylonen/wiktextract/issues/1372
        wxr.wtp.wiki_notice(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/570/20250517",
        )
        trailing: list[str | WikiNode] = []
        for sib in siblings[sibling_index + 1 :]:
            is_whitespace = isinstance(sib, str) and sib.strip() == ""
            is_link = isinstance(sib, WikiNode) and sib.kind == NodeKind.LINK
            if not (is_whitespace or is_link):
                break
            trailing.append(sib)
        lemma = clean_node(wxr, None, trailing).strip()

    if not lemma:
        wxr.wtp.wiki_notice(
            "Lemma extract from form-of template was empty or whitespace:"
            f"{t_name}, {t_args=}, {lemma=}",
            sortid="pos/609/20250925",
        )
        return
    parent_sense.form_of.append(AltForm(word=lemma))
# Maps the 3-letter Greek gender prefix of "<gender> του-πτώση/πτώσεις"
# template names to English tag names.
PTOSI_GENDER_INFLECTION_MAP = {
    "θηλ": "feminine",
    "αρσ": "masculine",
    "ουδ": "neuter",
}
# Maps the lowercase grammatical-number suffix of πτώση/πτώσεις template
# names to English tag names.
PTOSI_NUMBER_INFLECTION_MAP = {
    "εν": "singular",
    "πλ": "plural",
}
# Maps the single uppercase case letters embedded in πτώση/πτώσεις template
# names to English tag names.
PTOSI_CASE_INFLECTION_MAP = {
    "Ο": "nominative",
    "Α": "accusative",
    "Γ": "genitive",
    "Κ": "vocative",
}
def extract_form_of_templates_ptosi(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
) -> None:
    """Parse form_of for nouns and adjectives.

    Supports:
    * [gender του] πτώση-πτώσεις templates

    Notes:
    * The πτώση-πτώσεις templates contains:
        * Case(s): 1 for πτώση, >1 for πτώσεις - in uppercase characters.
        * Number: "εν" (singular) or "πλ" (plural)
    Examples:
    * {{πτώσηΑεν|κόρφος}} > accusative | singular
    * {{πτώσειςΟΚπλ|κόρφος}} > nominative, vocative | plural

    Malformed template names are silently ignored (early return); the
    caller guarantees positional argument 1 is present.
    """
    t_name = t_node.template_name
    inflection_t_names = ("πτώσεις", "πτώση")
    tags: list[str] = []

    # Parse and consume gender if any
    if "-" in t_name:
        # Cf. {{ουδ του-πτώσειςΟΑΚεν|καλός|grc}}
        # Split only on the FIRST dash: a stray extra dash in the name would
        # otherwise raise an uncaught ValueError here instead of being
        # rejected gracefully by the KeyError checks below.
        gender, inflection = t_name.split("-", 1)
        code = gender[:3]
        try:
            gender_tag = PTOSI_GENDER_INFLECTION_MAP[code]
        except KeyError:
            # Bad template name.
            return
        tags.append(gender_tag)
    else:
        inflection = t_name

    # Remove πτώση-πτώσεις prefix
    for prefix in inflection_t_names:
        if inflection.startswith(prefix):
            inflection = inflection[len(prefix) :]
            break

    try:
        # Lowercase letters encode grammatical number ("εν"/"πλ")...
        lowercase = "".join(ch for ch in inflection if ch.islower())
        number = PTOSI_NUMBER_INFLECTION_MAP[lowercase]
        # ...and uppercase letters encode one or more cases (Ο, Α, Γ, Κ).
        uppercase = [ch for ch in inflection if not ch.islower()]
        cases = [PTOSI_CASE_INFLECTION_MAP[ch] for ch in uppercase]
    except KeyError:
        # Bad template name.
        return

    tags.extend([*cases, number])
    tags.sort()  # For the tests, but also good practice

    lemma = clean_node(wxr, None, t_node.template_parameters[1])
    form_of = AltForm(word=lemma)
    parent_sense.form_of.append(form_of)
    parent_sense.tags.extend(tags)
def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Take what is preferably a line of text and extract tags and a gloss from
    it. The data is inserted into parent_sense, and for recursion purposes
    we return a boolean that tells whether there was any gloss text in a
    lower node."""
    if len(contents) == 0:
        return False

    # Handle any form_of/alt_of templates on this line before rendering it
    # to text; these mutate parent_sense directly.
    for i, t_node in enumerate(contents):
        if isinstance(t_node, TemplateNode):
            extract_alt_form_templates(wxr, parent_sense, t_node, contents, i)

    # NOTE(review): template_tags is never populated in this function, so
    # the `if len(template_tags) > 0` branch below looks dead — confirm
    # whether it is a leftover from removed code.
    template_tags: list[str] = []

    bl_linkages: list[Linkage] = []
    no_gloss_but_keep_anyway = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        """Capture {{βλ}} ("see also") templates as related linkages and
        strip them from the rendered text."""
        nonlocal bl_linkages
        if name == "βλ":
            for k, v in ht.items():
                if isinstance(k, int):
                    bl_linkages.append(Linkage(word=clean_node(wxr, None, v)))
            return ""
        return None

    # The rest of the text.
    text = clean_node(
        wxr,
        parent_sense,
        contents,
        template_fn=bl_template_handler_fn,
        node_handler_fn=bold_node_fn,
    )

    if len(bl_linkages) > 0:
        parent_sense.related.extend(bl_linkages)
        # A βλ-only line carries no gloss text but is still worth keeping.
        no_gloss_but_keep_anyway = True

    if not text.strip():
        if len(bl_linkages) <= 0:
            return False

    # print(f" ============ {contents=}, {text=}")

    # Greek Wiktionary uses a lot of template-less tags.
    if parens_n := PARENS_BEFORE_RE.match(text):
        blocks = ITER_PARENS_RE.findall(parens_n.group(0))
        # print(f"{blocks=}")
        kept_blocks: list[str] = []
        forms: list[str] = []
        raw_tag_texts: list[str] = []
        for block in blocks:
            if block_has_non_greek_text(block):
                # Keep parentheses with non-greek text with gloss text)
                kept_blocks.extend(("(", block, ") "))
                continue
            nforms, nraw_tag_texts = extract_forms_and_tags(block)
            forms.extend(nforms)
            raw_tag_texts.extend(nraw_tag_texts)
        # print(f"{forms=}, {raw_tag_texts=}")
        if forms:
            # print(f"{forms=}")
            parent_sense.related.extend(Linkage(word=form) for form in forms)
        parent_sense.raw_tags.extend(raw_tag_texts)
        # Reattach the remainder of the line after the parenthesized prefix.
        kept_blocks.append(text[parens_n.end() :])
        text = "".join(kept_blocks)

    # Strip the __I__/__B__ pseudo-markup inserted by bold_node_fn.
    text = re.sub(r"__/?[IB]__", "", text)

    if len(template_tags) > 0:
        parent_sense.raw_tags.extend(template_tags)

    if len(text) > 0:
        parent_sense.glosses.append(text)
        return True

    if no_gloss_but_keep_anyway:
        parent_sense.tags.append("no-gloss")
        return True

    return False
# Self-documenting aliases for the recurse_glosses1 return tuple; all
# linkage kinds share the same Linkage model.
Related: TypeAlias = Linkage
Synonym: TypeAlias = Linkage
Antonym: TypeAlias = Linkage
842def recurse_glosses1(
843 wxr: WiktextractContext,
844 parent_sense: Sense,
845 node: WikiNode,
846) -> tuple[
847 list[Sense],
848 list[Example],
849 list[Related],
850 list[Synonym],
851 list[Antonym],
852]:
853 """Helper function for recurse_glosses"""
854 # print(f"{node=}")
856 ret_senses: list[Sense] = []
857 ret_examples: list[Example] = []
858 ret_related: list[Related] = []
859 ret_synonyms: list[Synonym] = []
860 ret_antonyms: list[Antonym] = []
861 found_gloss = False
863 # Pydantic stuff doesn't play nice with Tatu's manual dict manipulation
864 # functions, so we'll use a dummy dict here that we then check for
865 # content and apply to `parent_sense`.
866 dummy_parent: dict[str, Any] = {}
868 related_linkages: list[Linkage] = []
869 example_is_synonym = False
870 example_is_antonym = False
872 def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
873 nonlocal related_linkages
874 nonlocal example_is_synonym
875 nonlocal example_is_antonym
876 # Sometimes the bl-templates point to synonyms or antonyms, instead
877 # of just "related"; we save them, and if example_is_xxxnym is true,
878 # we later return them as xxxnyms.
879 if name == "βλ":
880 for k, v in ht.items():
881 if isinstance(k, int):
882 related_linkages.append(
883 Linkage(word=clean_node(wxr, None, v))
884 )
885 return ""
886 if name in ("συνων", "συνών"):
887 example_is_synonym = True
888 return ""
889 if name in ("αντων", "αντών"):
890 example_is_antonym = True
891 return ""
892 return None
894 # List nodes contain only LIST_ITEM nodes, which may contain sub-LIST nodes.
895 if node.kind == NodeKind.LIST:
896 list_ret: tuple[
897 list[Sense],
898 list[Example],
899 list[Related],
900 list[Synonym],
901 list[Antonym],
902 ] = ([], [], [], [], [])
903 for child in node.children:
904 if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM: 904 ↛ 906line 904 didn't jump to line 906 because the condition on line 904 was never true
905 # This should never happen
906 wxr.wtp.error(
907 f"{child=} is direct child of NodeKind.LIST",
908 sortid="simple/pos/44",
909 )
910 continue
911 (
912 senses,
913 examples,
914 related,
915 synonyms,
916 antonyms,
917 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child)
918 list_ret[0].extend(senses)
919 list_ret[1].extend(examples)
920 list_ret[2].extend(related)
921 list_ret[3].extend(synonyms)
922 list_ret[4].extend(antonyms)
923 return list_ret
925 elif node.kind == NodeKind.LIST_ITEM: 925 ↛ 1027line 925 didn't jump to line 1027 because the condition on line 925 was always true
926 # Split at first LIST node found
927 split_at = next(
928 (
929 i
930 for i, c in enumerate(node.children)
931 if isinstance(c, WikiNode) and c.kind == NodeKind.LIST
932 ),
933 len(node.children),
934 )
935 contents = node.children[:split_at]
936 sublists = node.children[split_at:]
938 # A LIST and LIST_ITEM `sarg` is basically the prefix of the line, like
939 # `#` or `##:`: the token that appears at the very start of a line that
940 # is used to parse the depth and structure of lists.
941 # `#` Item 1
942 # `##` Item 1.1
943 # `##*` Example 1.1
944 if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"):
945 # This is either a quotation or example.
946 text = clean_node(
947 wxr, dummy_parent, contents, template_fn=bl_template_handler_fn
948 ).strip("⮡ \n")
950 # print(f"{contents=}, {text=}, {related_linkages=}")
952 if example_is_synonym or example_is_antonym:
953 link_linkages = []
954 for snode in contents:
955 if not isinstance(snode, WikiNode):
956 continue
957 if snode.kind == NodeKind.LINK:
958 link_linkages.append(
959 Linkage(
960 word=clean_node(wxr, None, snode.largs[0][0])
961 )
962 )
963 else:
964 for link in snode.find_child_recursively(NodeKind.LINK): 964 ↛ 965line 964 didn't jump to line 965 because the loop on line 964 never started
965 link_linkages.append(
966 Linkage(word=clean_node(wxr, None, link))
967 )
969 # print("=====")
970 # print(f"{link_linkages=}")
972 if example_is_synonym:
973 return [], [], [], link_linkages + related_linkages, []
974 elif example_is_antonym: 974 ↛ 977line 974 didn't jump to line 977 because the condition on line 974 was always true
975 return [], [], [], [], link_linkages + related_linkages
977 if len(related_linkages) > 0:
978 # parent_sense.related.extend(bl_linkages)
979 # related_linkages = []
980 # if not text.strip():
981 return [], [], related_linkages, [], []
983 example_is_synonym = False
984 example_is_antonym = False
986 if not text.strip(): 986 ↛ 987line 986 didn't jump to line 987 because the condition on line 986 was never true
987 return [], [], [], [], []
989 example = Example(text=text)
990 # logger.debug(f"{wxr.wtp.title}/example\n{text}")
991 if len(sublists) > 0:
992 translation = clean_node(wxr, dummy_parent, sublists).strip(
993 "#*: \n"
994 )
995 if translation != "": 995 ↛ 998line 995 didn't jump to line 998 because the condition on line 995 was always true
996 example.translation = translation
998 for k, v in dummy_parent.items(): 998 ↛ 999line 998 didn't jump to line 999 because the loop on line 998 never started
999 if k == "categories":
1000 parent_sense.categories.extend(v)
1001 dummy_parent = {}
1003 return [], [example], [], [], []
1005 found_gloss = parse_gloss(wxr, parent_sense, contents)
1007 for sl in sublists:
1008 if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST): 1008 ↛ 1010line 1008 didn't jump to line 1010 because the condition on line 1008 was never true
1009 # Should not happen
1010 wxr.wtp.error(
1011 f"Sublist is not NodeKind.LIST: {sublists=!r}",
1012 sortid="simple/pos/82",
1013 )
1014 continue
1015 (
1016 senses,
1017 examples,
1018 related,
1019 synonyms,
1020 antonyms,
1021 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), sl)
1022 ret_senses.extend(senses)
1023 ret_examples.extend(examples)
1024 ret_related.extend(related)
1025 ret_synonyms.extend(synonyms)
1026 ret_antonyms.extend(antonyms)
1027 if len(ret_senses) > 0:
1028 # the recursion returned actual senses from below, so we will
1029 # ignore everything else (incl. any example data that might have
1030 # been given to parent_sense) and return that instead.
1031 # XXX if this becomes relevant, add the example data to a returned
1032 # subsense instead?
1033 # if any(
1034 # isinstance(r, Sense) and r.tags == ["no-gloss"] for r in ret
1035 # ):
1036 # print(f"{ret=}")
1037 return (
1038 combine_senses_with_identical_glosses(ret_senses),
1039 [],
1040 [],
1041 [],
1042 [],
1043 )
1045 # If nothing came from below, then this.
1046 if found_gloss is True or "no-gloss" in parent_sense.tags: 1046 ↛ 1054line 1046 didn't jump to line 1054 because the condition on line 1046 was always true
1047 parent_sense.examples.extend(ret_examples)
1048 parent_sense.related.extend(ret_related)
1049 parent_sense.synonyms.extend(ret_synonyms)
1050 parent_sense.antonyms.extend(ret_antonyms)
1052 return [parent_sense], [], [], [], []
1054 return [], ret_examples, ret_related, ret_synonyms, ret_antonyms
def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Recurse through WikiNodes to find glosses and sense-related data."""
    # Kick off the recursion with an empty base sense; recurse_glosses1
    # returns (senses, examples, related, synonyms, antonyms).
    senses, examples, related, synonyms, antonyms = recurse_glosses1(
        wxr, Sense(), node
    )

    # Non-sense data (examples/linkages) should always have been attached
    # to a sense lower in the recursion; reaching here is a logic error.
    if examples or related or synonyms or antonyms:
        wxr.wtp.error(
            "NOT Sense has bubbled to recurse_glosses: "
            f"{examples=}, {related=}, {synonyms=}, {antonyms=}",
            sortid="pos/glosses/966",
        )

    collected: list[Sense] = []
    for sense in senses:
        # Normalize raw tags into canonical tags before returning.
        convert_tags_in_sense(sense)
        collected.append(sense)

    return collected
def split_nodes_to_lines(
    nodes: list[WikiNode | str],
) -> Iterator[list[WikiNode | str]]:
    """Take a list of nodes and split up the list into lines.
    This could be done by using node_to_wikitext() to reverse the parsing,
    and then you could parse the individual lines after splitting the text,
    but it seems unnecessary in the context of Greek Wiktionary PoS sections.

    Yields lists of nodes/strings, one list per logical source line.  A
    LIST node is always yielded alone as its own "line"; plain strings are
    split on newlines, flushing the accumulated parts at each line break.
    NOTE(review): an empty string line still triggers a flush, so empty
    part-lists can be yielded — callers appear to tolerate this.
    """
    # Accumulator for the pieces of the current (not yet complete) line.
    parts: list[WikiNode | str] = []
    for node in nodes:
        if isinstance(node, WikiNode):
            # Lists are returned as whole, they're their own line
            if node.kind == NodeKind.LIST:
                # Flush anything accumulated before the list...
                if len(parts) > 0:
                    yield parts
                    parts = []
                # ...then emit the list by itself.
                yield [node]
                continue
            if isinstance(node, TemplateNode) and node.template_name in (
                # Ignore specific templates, like {{((}} that bookends a column.
                "((",
                "))",
                "clear",
                "κλείδα-ελλ",
            ):
                continue
            # Any other WikiNode belongs to the current line.
            parts.append(node)
        else:
            if "\n" in node:
                split_string = node.splitlines()
                # Every segment except the last is terminated by a newline:
                # append it (if non-empty) and flush the line.
                for spl in split_string[:-1]:
                    if spl:
                        parts.append(spl)
                    yield parts
                    parts = []
                # special handling for final newline; splitlines ignores it
                if node.endswith("\n"):
                    # The last segment is also newline-terminated: flush it.
                    if split_string[-1]:
                        parts.append(split_string[-1])
                    yield parts
                    parts = []
                elif split_string[-1]:
                    # Trailing text without a newline stays pending.
                    parts.append(split_string[-1])
            elif node:
                # Non-empty string without newlines: accumulate.
                parts.append(node)

    # yield final parts
    if len(parts) > 0:
        yield parts
# Splits on bold/italic pseudo-markup markers and on ", " / ". " separators;
# the capturing group makes re.split() keep the separators at odd indices.
BOLD_RE = re.compile(r"(__/?[BI]__|, |\. )")


def extract_forms_and_tags(tagged_text: str) -> tuple[list[str], list[str]]:
    """Split marked-up head text into bolded forms and other raw tags.

    Text found between ``__B__``/``__/B__`` markers is collected as a form;
    every other non-separator chunk is collected as a raw tag.
    """
    forms: list[str] = []
    tags: list[str] = []

    in_bold = False

    for idx, chunk in enumerate(BOLD_RE.split(tagged_text)):
        chunk = chunk.strip()
        if not chunk:
            continue

        if idx % 2 == 1:
            # Odd index: a separator captured by the regex group.
            # ", " and ". " merely split; only bold markers change state.
            if chunk == "__B__":
                in_bold = True
            elif chunk == "__/B__":
                in_bold = False
            continue

        # Even index: plain text between separators.  Bold text is a form;
        # everything else is treated as a raw tag for later processing.
        (forms if in_bold else tags).append(chunk)

    return forms, tags
# Matches the inline pseudo-markup markers (__I__, __/B__, etc.) so they can
# be stripped before character inspection.
META_RE = re.compile(r"__/?[ILEB]__")


def block_has_non_greek_text(text: str) -> bool:
    """Return True if any word's first alphabetic character is non-Greek.

    Markup markers are removed first; words with no alphabetic characters
    are ignored.
    """
    cleaned = META_RE.sub("", text)
    for word in cleaned.split():
        # Only the first alphabetic character of each word is inspected.
        first_alpha = next((ch for ch in word if ch.isalpha()), None)
        if first_alpha is None:
            continue
        if not unicode_name(first_alpha).startswith("GREEK"):
            return True
    return False
def combine_senses_with_identical_glosses(
    orig_senses: list[Sense],
) -> list[Sense]:
    """Merge senses whose gloss lists are exactly equal.

    When two or more senses share an identical tuple of glosses, the first
    one absorbs the rest via Sense.merge().  If every gloss list is unique,
    the original list is returned unchanged.
    """
    # Bucket senses by their full gloss tuple, preserving insertion order.
    buckets: dict[tuple[str, ...], list[Sense]] = {}
    for sense in orig_senses:
        buckets.setdefault(tuple(sense.glosses), []).append(sense)

    # Fast path: no two senses share an identical gloss list.
    if all(len(group) == 1 for group in buckets.values()):
        return orig_senses

    merged: list[Sense] = []
    for group in buckets.values():
        primary = group[0]
        for duplicate in group[1:]:
            primary.merge(duplicate)
        merged.append(primary)

    return merged