Coverage for src/wiktextract/extractor/el/pos.py: 80%
465 statements
« prev ^ index » next — coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1import re
2from collections.abc import Iterator
3from functools import partial
4from typing import Any, TypeAlias
5from unicodedata import name as unicode_name
7from wikitextprocessor import (
8 HTMLNode,
9 NodeKind,
10 TemplateArgs,
11 TemplateNode,
12 WikiNode,
13)
14from wikitextprocessor.parser import LEVEL_KIND_FLAGS
16from wiktextract import WiktextractContext
17from wiktextract.extractor.el.tags import translate_raw_tags
18from wiktextract.page import clean_node
20from .head import parse_head
21from .linkages import process_linkage_section
22from .models import (
23 Example,
24 FormOf,
25 FormSource,
26 Linkage,
27 Sense,
28 TemplateData,
29 WordEntry,
30)
31from .parse_utils import (
32 GREEK_LANGCODES,
33 expand_suffix_forms,
34 parse_lower_heading,
35 remove_duplicate_forms,
36)
37from .section_titles import POS_HEADINGS, Heading, POSName
38from .table import parse_table, process_inflection_section, remove_article_forms
39from .tags_utils import convert_tags_in_sense
40from .text_utils import (
41 ENDING_NUMBER_RE,
42 normalized_int,
43)
44from .translations import process_translations
46# from wiktextract.wxr_logging import logger
def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    prev_data: WordEntry | None,  # data from the last entry in this language
    pos: POSName,  # the "noun" in "Noun 2"
    title: str,
    pos_tags: list[str],
    pos_num: int = -1,  # the "2" in "Noun 2"
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'.

    `data` provides basic data common with other POS sections, like
    pronunciation or etymology, and is mutated in place (pos, tags, sounds,
    forms, senses, sub-sections) before being returned.

    NOTE(review): the return type allows None, but every visible code path
    here returns `data` — confirm whether None is still a possible result.
    """

    # Metadata for different part-of-speech kinds.
    # print(f"{pos_title=}, {pos_tags=}, {pos_num=}")
    data.pos = pos  # the internal/translated name for the POS
    data.pos_num = pos_num  # SEW uses "Noun 1", "Noun 2" style headings.
    for pos_tag in pos_tags:
        if pos_tag not in data.tags:
            data.tags.append(pos_tag)

    wxr.wtp.start_subsection(title)

    # Sound data associated with this POS might be coming from a shared
    # section, in which case we've tried to tag the sound data with its
    # pos name + number if possible. Filter out stuff that doesn't fit.
    # This is actually pretty common, but if the edition has proper hierarchies
    # for this, doing this step might be unnecessary.
    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos section(s), so
            # we add it to everything; this is basically the default behavior.
            new_sounds.append(sound)
        else:
            for sound_pos in sound.poses:
                # A pos reference like "noun 2" is split into name + number.
                m = ENDING_NUMBER_RE.search(sound_pos)
                if m is not None:
                    s_num = normalized_int(m.group(1).strip())
                    s_pos = sound_pos[: m.start()].strip().lower()
                else:
                    s_pos = sound_pos.strip().lower()
                    s_num = -1
                # Normalize the heading text to the canonical pos name.
                sound_meta = POS_HEADINGS[s_pos]
                s_pos = sound_meta["pos"]
                if s_pos == data.pos and s_num == data.pos_num:
                    new_sounds.append(sound)
    data.sounds = new_sounds

    # Get child nodes *except* headings (= LEVEL).
    pos_contents = list(
        node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
        # include empty string only for debug printing?
    )

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # split_nodes_to_lines returns lists items on their own 'line'
    node_lines = list(split_nodes_to_lines(pos_contents))

    glosses_index = None
    glosses_lists = []
    for i, line in enumerate(node_lines):
        # Looking at the "rump" after glosses lists starts, it's simplest
        # just to pull all the list nodes, and handle them. Anything after
        # or inbetween (like categories, extra templates, tables and images)
        # can be ignored.
        if (
            len(line) == 1
            and isinstance(line[0], WikiNode)
            and line[0].kind == NodeKind.LIST
            and (line[0].sarg != ":")
        ):
            if glosses_index is None:
                glosses_index = i
            glosses_lists.append(line[0])

    if glosses_index is None:
        # if nothing found, accept ":" nodes
        for i, line in enumerate(node_lines):
            if (
                len(line) == 1
                and isinstance(line[0], WikiNode)
                and line[0].kind == NodeKind.LIST
            ):
                if glosses_index is None:
                    glosses_index = i
                glosses_lists.append(line[0])

    if glosses_index is None:
        # Could not find any glosses.
        # logger.info(f" //// {wxr.wtp.title}\n MISSING GLOSSES")
        wxr.wtp.wiki_notice("Missing glosses", sortid="pos/20250121")
        data.tags.append("no-gloss")

    template_data: list[TemplateData] = []
    # NOTE(review): category_data is collected by the handler below but never
    # read afterwards in this function — confirm whether it is still needed.
    category_data: list[str] = []
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in bold_node_handler
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not templates inside templates.
    template_depth = 0
    top_template_name: str | None = None

    def bold_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | str | None:
        """Insert special markers `__*S__` and `__*E__` around bold nodes so
        that the strings can later be split into "head-word" and "tag-words"
        parts. Collect incidental stuff, like side-tables, that are often
        put around the head."""
        assert isinstance(node, WikiNode)
        kind = node.kind
        nonlocal template_depth
        nonlocal top_template_name
        if kind == NodeKind.BOLD or (
            isinstance(node, HTMLNode)
            and (
                node.tag == "span"
                and "style" in node.attrs
                and (
                    "bold" in node.attrs["style"]
                    # Special handling for output for stuff in arabic script
                    or node.attrs["style"] == "color:black; font-size:200%;"
                )
                or node.tag == "b"
                or node.tag == "strong"
            )
        ):
            # These are word forms almost always
            return ["__B__", *node.children, "__/B__"]
        elif kind == NodeKind.ITALIC or (
            isinstance(node, HTMLNode)
            and (
                (
                    node.tag == "span"
                    and "style" in node.attrs
                    and "italic" in node.attrs["style"]
                )
                or node.tag == "i"
                or node.tag == "em"
            )
        ):
            # These are almost always tag words; often 'kai' isn't italicized,
            # for example.
            return ["__I__", *node.children, "__/I__"]
        elif isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # the templates are handled with bold_node_handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                template_data.append(
                    TemplateData(
                        name=node.template_name,
                        args={
                            str(k): clean_node(wxr, None, v)
                            for k, v in node.template_parameters.items()
                        },
                        expansion=expanded,
                    )
                )
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            # Track nesting so only visible top-level templates are recorded.
            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=bold_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret
        elif kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                return None
            if node.largs[0][0].startswith("Κατηγορία:"):
                category_data.append(node.largs[0][0][len("Κατηγορία:") :])
                return [""]
            # Special case for meta-links like Πρότυπο:ετ that generate
            # both a category link and :category link that is actually
            # displayed as a link, but for our purposes we want to ignore
            # that it is a link; it's a tag.
            if node.largs[0][0].startswith(":Κατηγορία:"):
                # unpacking a list-comprehension, unpacking into a list
                # seems to be more performant than adding lists together.
                return [
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                        node_handler_fn=bold_node_handler_fn,
                    )
                    # output the "visible" half of the link.
                ]
            if node.largs[0][0].startswith("Αρχείο:"):
                # Ignore image/file links entirely.
                return [""]
            # Often forms are 'formatted' with links, so let's mark these
            # too.
            return [
                "__L__",
                wxr.wtp.node_to_text(
                    node.largs[1:2] or node.largs[0],
                    node_handler_fn=bold_node_handler_fn,
                ),
                # output the "visible" half of the link.
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]
            # print(f"{node.largs=}")

        elif kind in {
            NodeKind.TABLE,
        }:
            # XXX Handle tables here
            # template depth and top-level template name
            nonlocal table_nodes
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # Get Head Line
    # Head *should* be immediately before the glosses...
    # print(node_lines[:glosses_index])
    found_head = False

    # Walk backwards from the glosses toward the section start, looking for
    # the first line that parses as a head.
    # NOTE(review): template_data is reset on every iteration, so after the
    # loop it holds templates from the *earliest* non-empty line, not
    # necessarily the head line — confirm this is intended.
    for line in reversed(node_lines[:glosses_index]):
        template_data = []
        template_depth = 0
        stripped = (
            wxr.wtp.node_to_text(line, node_handler_fn=bold_node_handler_fn)
            .removeprefix(":")
            .strip()
        )
        if not stripped:
            continue
        if not found_head and (parsed_forms := parse_head(wxr, stripped)):
            for form in parsed_forms:
                translate_raw_tags(form)

            if (
                data.lang_code == "el"
                and not data.word.startswith("-")
                # If there are spaces around the "/", we don't parse the
                # header correctly, so just skip the expansion.
                # Ex. "πρωτοπόρος, -α / -ος, -ο"
                # Remove this check if that ever gets fixed.
                and len(parsed_forms) == 3
                # Only adjectives or participles
                and (
                    data.pos == "adj"
                    or (data.pos == "verb" and "participle" in data.tags)
                )
            ):
                parsed_forms = expand_suffix_forms(parsed_forms)
            parsed_forms = remove_article_forms(parsed_forms, data.word)
            data.forms.extend(parsed_forms)
            found_head = True

    if not found_head:
        # There are a bunch of Greek Wiktionary articles with POS sections
        # without heads, but they seem to always follow ones with heads;
        # in this case, the result is just not including any `forms` field
        # for these (or copying the previous one).

        if prev_data is None:
            wxr.wtp.wiki_notice(
                f"Part of speech missing head: {wxr.wtp.title}",
                sortid="pos/460/20250104",
            )
        else:
            # No head found, copy previous (in this language)
            data.forms = [
                form.model_copy(deep=True) for form in prev_data.forms
            ]

    if len(template_data) > 0:
        data.head_templates = template_data
        # logger.info(
        #     f"  //// {wxr.wtp.title}\n  >>>"
        #     + "\n  >>>".join(repr(td) for td in template_data)
        # )

    # Parse any side-tables (usually inflection tables) the handler collected.
    for template_name, table_node in table_nodes:
        # XXX template_name
        parse_table(
            wxr,
            table_node,
            data,
            data.lang_code in GREEK_LANGCODES,
            template_name=template_name or "",
            source="inflection",
        )

    data.forms = remove_duplicate_forms(wxr, data.forms)

    # Ignore images and files
    # 2025-01-17 13:10:10,035 INFO: //// Ηρόδοτος
    # //// [[Αρχείο:Herodotus Massimo Inv124478.jpg|200px|thumb|[[προτομή]] του Ηροδότου]]

    # Have to ignore {{(( specifically. Creates columns.
    # 2025-01-17 13:10:11,059 INFO: //// κάνω
    # //// {{((|width=97%}}

    # logger.info(f"<<<<<<<<< {wxr.wtp.title}\n<<<" + "\n<<<".join(pparts))
    # see: free -> {{en-verb-'free'}} creates a floating inflection table
    # followed by the usual head template

    # see: τηλεομοιοτυπία
    # '''{{PAGENAME}}''' {{θ}}
    # theta is basically {{f|...}}

    # see: θηλυκός
    # '''{{PAGENAME}}, -ή''' ''και'' '''-ιά, -ό'''
    # pagename, -e and -ia, -o, no indication of what these mean

    # Ιόνια νησιά
    # >>>'''{{PAGENAME}}''' ''πληθυντικός του'' [[ιόνιος|Ιόνιο]] [[νησί]]
    # plural of 'Ionian island'

    # >>>>>>>>> free
    # >>>{{en-adj-r}} # floating table
    # >>>{{τ|en|{{PAGENAME}}}}, ''συγκριτικός'' '''freer''', ''υπερθετικός'' '''freest'''
    # pretty consistent bolding and italics

    # genus
    # {{τ|la|{{PAGENAME}}}} {{ο}} ([[γενική]]: generis) (3ης [[κλίση]]ς)

    # ουδέτερος
    # >>>'''{{PAGENAME}} -η -ο'''

    # καφέ
    # >>>'''{{PAGENAME}}''' {{ακλ|επίθ}}
    # aklitos, uninflected

    # καφέ
    # >>>[[Αρχείο:Cafe Museum - Vienna (5402363918).jpg|μικρογραφία|τραπέζι σε βιενέζικο '''καφέ''']]
    # >>>'''{{PAGENAME}}''' {{ο}} {{ακλ|ουδ}}
    # Ignore images

    # κρόκος
    # >>>{| align="right"
    # >>>
    # >>>|-
    # >>>
    # >>>|[[Αρχείο:Crocus sativus1.jpg|thumb|150px|Άνθη '''κρόκου''' (''Crocus sativus'').]]
    # >>>
    # >>>
    # >>>|[[Αρχείο:Raw egg.jpg|thumb|200px|Ο '''κρόκος''' ενός αβγού.]]
    # >>>
    # >>>
    # >>>|}
    # >>>
    # >>>'''{{PAGENAME}}''' {{α}}

    # p
    # >>>'''{{PAGENAME}}''' ''πεζό'' (''κεφαλαίο:'' '''{{l|P|la}}''')
    # lowercase, uppercase

    # Δημόκριτος
    # >>>'''{{PAGENAME}}'''
    # >>># {{όνομα||α}}
    # >>>{{clear}}
    # Clear is just formatting to move the line down where there are empty
    # margins.

    # BIG ASSUMPTION: let us assume that Greek Wiktionary doesn't use templates
    # that generate multiline text that is part of head. That is, we can see
    # each newline because they are in strings, and when something that does
    # generate virtual newlines (list) pops up, that's when the head portion
    # ends.
    # Greek Wiktionary head sections look like this:
    # > Pre-head templates that create side-tables, like inflections
    # > Possible formatting templates like {{clear}} that should be ignored
    # > Head template last before glosses list
    # > Clear again...
    # > Glosses list tree, where we can stop.
    # We can create "lines" of these by looping over the items in pos_content
    # and looking for newlines in strings, because that's where they mainly
    # should be (except side-table templates). We handle earlier lines
    # differently than the last line before the glosses list, which is the
    # head.

    # return None

    # ======================

    ### Glosses after head ###
    # parts = []
    got_senses = False
    for lst in glosses_lists:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses, but with
        # Simple English Wiktionary that seems rare.
        # logger.debug(f"{lst}")
        senses = recurse_glosses(wxr, lst, data)
        if len(senses) > 0:
            got_senses = True
            for sense in senses:
                translate_raw_tags(sense)
            data.senses.extend(senses)

    if not got_senses and len(glosses_lists) > 0:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    # If there is no list, clump everything into one gloss.
    # if not len(glosses_lists > 0):
    #     sense = Sense()
    #     found_gloss = parse_gloss(wxr, sense, pos_contents[i:])
    #     if found_gloss is True or len(sense.raw_tags) > 0:
    #         convert_tags_in_sense(sense)
    #         if len(sense.glosses) == 0 and "no-gloss" not in sense.tags:
    #             sense.tags.append("no-gloss")
    #         data.senses.append(sense)

    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    #####
    #####
    # TEMP DEBUG PRINTS

    # Handle sub-sections (translations, inflection, linkages) below this POS.
    pos_sublevels = list(
        node.find_child(LEVEL_KIND_FLAGS)
        # include empty string only for debug printing?
    )

    for sl in pos_sublevels:
        subtitle = clean_node(wxr, None, sl.largs).lower().strip()

        heading_type, *_ = parse_lower_heading(wxr, subtitle)

        if heading_type == Heading.Translations:
            process_translations(wxr, data, sl)
        elif heading_type == Heading.Infl:
            source: FormSource = "inflection"
            if data.lang_code in ("el", "grc"):
                # Greek inflection sections for verbs are conjugations.
                source = "conjugation"
            process_inflection_section(wxr, data, sl, source=source)
        elif heading_type in (
            Heading.Related,
            Heading.Synonyms,
            Heading.Antonyms,
            Heading.Transliterations,
        ):
            process_linkage_section(wxr, data, sl, heading_type)
        # if heading_type not in (
        #     Heading.Translations,
        #     Heading.Ignored,
        #     Heading.Infl,
        #     Heading.Related,
        #     Heading.Synonyms,
        #     Heading.Antonyms,
        #     Heading.Derived,
        #     # We're going to ignore homonyms because they're
        #     # only tangentially related, like anagrams
        #     Heading.Homonyms,
        # ):
        #     # ...
        #     expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(sl))
        #     # text = clean_node(wxr, None, sl)
        #     logger.warning(
        #         f"""
        # {wxr.wtp.title}: {heading_type}, {ok=}
        # {expanded}

        # ###########################
        # """
        #     )

    #####
    #####
    return data
# Matches one or more parenthesized groups at the very start of a gloss line,
# e.g. "(ιατρική) (σπάνιο) rest of gloss"; used to peel off template-less tags.
PARENS_BEFORE_RE = re.compile(r"\s*(\([^()]+\)\s*)+")
# Captures the contents of each individual parenthesized group within the
# prefix matched by PARENS_BEFORE_RE.
ITER_PARENS_RE = re.compile(r"\(([^()]+)\)")
def bold_node_fn(
    node: WikiNode,
) -> list[str | WikiNode] | None:
    """Wrap italic and bold nodes in marker tokens.

    Returns the node's children bracketed by `__I__`/`__/I__` or
    `__B__`/`__/B__` so later text processing can distinguish emphasized
    spans; returns None for every other node kind (default handling).
    """
    # print(f"{node=}")
    if node.kind == NodeKind.ITALIC:
        markers = ("__I__", "__/I__")
    elif node.kind == NodeKind.BOLD:
        markers = ("__B__", "__/B__")
    else:
        return None
    opener, closer = markers
    return [opener, *node.children, closer]
def extract_form_of_templates(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
    siblings: list[str | WikiNode],
    siblings_index: int,
) -> None:
    """Parse form_of for nouns, adjectives and verbs.

    Supports:
    1. κλ          | generic                | form_of
    2. γρ          | generic                | form_of
    3. πτώση/πτώσεις | nouns, adjectives etc. | form_of and tags
    4. υπο/υποκ    | nouns                  | form_of
    5. μεγ/μεγεθ   | nouns                  | form_of
    6. ρημ τύπος   | verbs                  | form_of
    7. μτχ         | verbs                  | form_of

    * References:
    1. https://el.wiktionary.org/wiki/Πρότυπο:κλ
    2. https://el.wiktionary.org/wiki/Module:άλλημορφή
    3. https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_κλιτικούς_τύπους
    4. https://el.wiktionary.org/wiki/Πρότυπο:υπο
    5. https://el.wiktionary.org/wiki/Πρότυπο:μεγ
    6. https://el.wiktionary.org/wiki/Πρότυπο:ρημ_τύπος
    7. https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_μετοχές
    """
    name = t_node.template_name
    params = t_node.template_parameters

    # Pre-bind the common arguments of the basic extractor.
    basic_extract = partial(
        extract_form_of_templates_basic,
        wxr,
        parent_sense,
        siblings,
        siblings_index,
        name,
        t_node,
    )

    # Generic inflection template.
    if name == "κλ":
        return basic_extract(extract_argument=2)

    # Notes:
    # * All occurrences in wiktionary have at least one argument
    # * Only handle cases where the second argument refers to a form:
    #   μορφ / μορφή / λόγια μορφή του, etc.
    #   and ignore those mistakenly used as synonym templates
    if name in ("γρ", "γραφή του", "alter") and 2 in params:
        if "μορφ" in clean_node(wxr, None, params[2]):
            return basic_extract(extract_argument=1)

    # Nouns and adjectives: case-inflection templates.
    if ("πτώσεις" in name or "πτώση" in name) and 1 in params:
        return extract_form_of_templates_ptosi(wxr, parent_sense, t_node)

    # Nouns: diminutives and augmentatives.
    # The "diminutive/augmentative" tags are added later on via translation
    # of the "υποκοριστικό/μεγεθυντικό" raw_tags.
    if name in ("υπο", "υποκ", "μεγ", "μεγεθ") and 1 in params:
        return basic_extract(extract_argument=1)

    # Verbs.
    if name == "ρημ τύπος":
        return basic_extract(extract_argument=2)

    # Participles.
    if name.startswith("μτχ"):
        return basic_extract(extract_argument=1)
def extract_form_of_templates_basic(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    siblings: list[str | WikiNode],
    sibling_index: int,
    t_name: str,
    t_node: TemplateNode,
    extract_argument: int | str,
) -> None:
    """Extract a form_of lemma from a single template argument.

    Reads `extract_argument` from `t_node`'s parameters; when the argument
    is missing (e.g. the μτχππ template has no args), falls back to
    consuming the link nodes that immediately follow the template among
    `siblings`. Appends a FormOf entry to `parent_sense` when a non-empty
    lemma is found; otherwise emits a wiki notice.
    """
    t_args = t_node.template_parameters
    if extract_argument in t_args:
        lemma = clean_node(wxr, None, t_args[extract_argument]).strip()
    else:
        # mtxpp template has no args, consume the next links for the
        # form_of field
        # cf. https://github.com/tatuylonen/wiktextract/issues/1372
        wxr.wtp.wiki_notice(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/570/20250517",
        )
        links: list[str | WikiNode] = []
        for node in siblings[sibling_index + 1 :]:
            # Accept only whitespace strings and link nodes; stop at
            # anything else.
            if not (
                (isinstance(node, str) and node.strip() == "")
                or (isinstance(node, WikiNode) and node.kind == NodeKind.LINK)
            ):
                break
            links.append(node)
        lemma = clean_node(wxr, None, links).strip()

    if lemma:
        form_of = FormOf(word=lemma)
        parent_sense.form_of.append(form_of)
    else:
        # Fixed diagnostic wording: previously the two concatenated string
        # parts produced "...whitespace:{t_name}" with no separating space.
        wxr.wtp.wiki_notice(
            "Lemma extracted from form-of template was empty or whitespace: "
            f"{t_name}, {t_args=}, {lemma=}",
            sortid="pos/609/20250925",
        )
# Three-letter Greek gender abbreviations (from template-name prefixes like
# "ουδ του-...") mapped to English gender tags.
PTOSI_GENDER_INFLECTION_MAP = {
    "θηλ": "feminine",
    "αρσ": "masculine",
    "ουδ": "neuter",
}
# Lowercase number suffix of πτώση/πτώσεις template names -> number tag.
PTOSI_NUMBER_INFLECTION_MAP = {
    "εν": "singular",
    "πλ": "plural",
}
# Uppercase case letters of πτώση/πτώσεις template names -> case tags.
PTOSI_CASE_INFLECTION_MAP = {
    "Ο": "nominative",
    "Α": "accusative",
    "Γ": "genitive",
    "Κ": "vocative",
}
def extract_form_of_templates_ptosi(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
) -> None:
    """Parse form_of for nouns and adjectives.

    Supports:
    * [gender του] πτώση-πτώσεις templates

    Notes:
    * The πτώση-πτώσεις templates contains:
        * Case(s): 1 for πτώση, >1 for πτώσεις - in uppercase characters.
        * Number: "εν" (singular) or "πλ" (plural)
    Examples:
    * {{πτώσηΑεν|κόρφος}} > accusative | singular
    * {{πτώσειςΟΚπλ|κόρφος}} > nominative, vocative | plural
    """
    name = t_node.template_name
    tags: list[str] = []

    # Parse and consume an optional gender prefix,
    # cf. {{ουδ του-πτώσειςΟΑΚεν|καλός|grc}}
    if "-" in name:
        gender_part, inflection = name.split("-")
        try:
            tags.append(PTOSI_GENDER_INFLECTION_MAP[gender_part[:3]])
        except KeyError:
            # Unrecognized gender abbreviation: bad template name.
            return
    else:
        inflection = name

    # Strip the πτώση/πτώσεις prefix, leaving the case letters and the
    # number suffix.
    for prefix in ("πτώσεις", "πτώση"):
        if inflection.startswith(prefix):
            inflection = inflection[len(prefix) :]
            break

    lower_part = "".join(ch for ch in inflection if ch.islower())
    upper_part = [ch for ch in inflection if not ch.islower()]
    try:
        number_tag = PTOSI_NUMBER_INFLECTION_MAP[lower_part]
        case_tags = [PTOSI_CASE_INFLECTION_MAP[ch] for ch in upper_part]
    except KeyError:
        # Unrecognized case/number letters: bad template name.
        return

    tags.extend([*case_tags, number_tag])
    tags.sort()  # For the tests, but also good practice

    lemma = clean_node(wxr, None, t_node.template_parameters[1])
    parent_sense.form_of.append(FormOf(word=lemma))
    parent_sense.tags.extend(tags)
def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Take what is preferably a line of text and extract tags and a gloss from
    it. The data is inserted into parent_sense, and for recursion purposes
    we return a boolean that tells whether there was any gloss text in a
    lower node."""
    if len(contents) == 0:
        return False

    # First pass: harvest form_of data from any template nodes on the line.
    for i, t_node in enumerate(contents):
        if isinstance(t_node, TemplateNode):
            extract_form_of_templates(wxr, parent_sense, t_node, contents, i)

    # NOTE(review): template_tags is never populated anywhere in this
    # function, so the extend() near the end is currently dead code.
    template_tags: list[str] = []

    bl_linkages: list[Linkage] = []
    no_gloss_but_keep_anyway = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        # Capture "βλ" (see-also) templates as related-word linkages and
        # remove them from the rendered text.
        nonlocal bl_linkages
        if name == "βλ":
            for k, v in ht.items():
                if isinstance(k, int):
                    bl_linkages.append(Linkage(word=clean_node(wxr, None, v)))
            return ""
        return None

    # The rest of the text.
    text = clean_node(
        wxr,
        parent_sense,
        contents,
        template_fn=bl_template_handler_fn,
        node_handler_fn=bold_node_fn,
    )

    if len(bl_linkages) > 0:
        parent_sense.related.extend(bl_linkages)
        # A line consisting only of βλ links still yields a (glossless) sense.
        no_gloss_but_keep_anyway = True

    if not text.strip():
        if len(bl_linkages) <= 0:
            return False

    # print(f"  ============ {contents=}, {text=}")

    # Greek Wiktionary uses a lot of template-less tags.
    if parens_n := PARENS_BEFORE_RE.match(text):
        blocks = ITER_PARENS_RE.findall(parens_n.group(0))
        # print(f"{blocks=}")
        kept_blocks: list[str] = []
        forms: list[str] = []
        raw_tag_texts: list[str] = []
        for block in blocks:
            if block_has_non_greek_text(block):
                # Keep parentheses with non-greek text with gloss text)
                kept_blocks.extend(("(", block, ") "))
                continue
            nforms, nraw_tag_texts = extract_forms_and_tags(block)
            forms.extend(nforms)
            raw_tag_texts.extend(nraw_tag_texts)
        # print(f"{forms=}, {raw_tag_texts=}")
        if forms:
            # print(f"{forms=}")
            parent_sense.related.extend(Linkage(word=form) for form in forms)
        parent_sense.raw_tags.extend(raw_tag_texts)
        kept_blocks.append(text[parens_n.end() :])
        text = "".join(kept_blocks)

    # Strip the italic/bold markers inserted by bold_node_fn.
    text = re.sub(r"__/?[IB]__", "", text)

    if len(template_tags) > 0:
        parent_sense.raw_tags.extend(template_tags)

    if len(text) > 0:
        parent_sense.glosses.append(text)
        return True

    if no_gloss_but_keep_anyway:
        parent_sense.tags.append("no-gloss")
        return True

    return False
# Semantic aliases for the linkage lists returned by recurse_glosses1; all
# three roles share the same Linkage model but are kept in separate lists.
Related: TypeAlias = Linkage
Synonym: TypeAlias = Linkage
Antonym: TypeAlias = Linkage
863def recurse_glosses1(
864 wxr: WiktextractContext,
865 parent_sense: Sense,
866 node: WikiNode,
867) -> tuple[
868 list[Sense],
869 list[Example],
870 list[Related],
871 list[Synonym],
872 list[Antonym],
873]:
874 """Helper function for recurse_glosses"""
875 # print(f"{node=}")
877 ret_senses: list[Sense] = []
878 ret_examples: list[Example] = []
879 ret_related: list[Related] = []
880 ret_synonyms: list[Synonym] = []
881 ret_antonyms: list[Antonym] = []
882 found_gloss = False
884 # Pydantic stuff doesn't play nice with Tatu's manual dict manipulation
885 # functions, so we'll use a dummy dict here that we then check for
886 # content and apply to `parent_sense`.
887 dummy_parent: dict[str, Any] = {}
889 related_linkages: list[Linkage] = []
890 example_is_synonym = False
891 example_is_antonym = False
893 def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
894 nonlocal related_linkages
895 nonlocal example_is_synonym
896 nonlocal example_is_antonym
897 # Sometimes the bl-templates point to synonyms or antonyms, instead
898 # of just "related"; we save them, and if example_is_xxxnym is true,
899 # we later return them as xxxnyms.
900 if name == "βλ":
901 for k, v in ht.items():
902 if isinstance(k, int):
903 related_linkages.append(
904 Linkage(word=clean_node(wxr, None, v))
905 )
906 return ""
907 if name in ("συνων", "συνών"):
908 example_is_synonym = True
909 return ""
910 if name in ("αντων", "αντών"):
911 example_is_antonym = True
912 return ""
913 return None
915 # List nodes contain only LIST_ITEM nodes, which may contain sub-LIST nodes.
916 if node.kind == NodeKind.LIST:
917 list_ret: tuple[
918 list[Sense],
919 list[Example],
920 list[Related],
921 list[Synonym],
922 list[Antonym],
923 ] = ([], [], [], [], [])
924 for child in node.children:
925 if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM: 925 ↛ 927line 925 didn't jump to line 927 because the condition on line 925 was never true
926 # This should never happen
927 wxr.wtp.error(
928 f"{child=} is direct child of NodeKind.LIST",
929 sortid="simple/pos/44",
930 )
931 continue
932 (
933 senses,
934 examples,
935 related,
936 synonyms,
937 antonyms,
938 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child)
939 list_ret[0].extend(senses)
940 list_ret[1].extend(examples)
941 list_ret[2].extend(related)
942 list_ret[3].extend(synonyms)
943 list_ret[4].extend(antonyms)
944 return list_ret
946 elif node.kind == NodeKind.LIST_ITEM: 946 ↛ 1048line 946 didn't jump to line 1048 because the condition on line 946 was always true
947 # Split at first LIST node found
948 split_at = next(
949 (
950 i
951 for i, c in enumerate(node.children)
952 if isinstance(c, WikiNode) and c.kind == NodeKind.LIST
953 ),
954 len(node.children),
955 )
956 contents = node.children[:split_at]
957 sublists = node.children[split_at:]
959 # A LIST and LIST_ITEM `sarg` is basically the prefix of the line, like
960 # `#` or `##:`: the token that appears at the very start of a line that
961 # is used to parse the depth and structure of lists.
962 # `#` Item 1
963 # `##` Item 1.1
964 # `##*` Example 1.1
965 if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"):
966 # This is either a quotation or example.
967 text = clean_node(
968 wxr, dummy_parent, contents, template_fn=bl_template_handler_fn
969 ).strip("⮡ \n")
971 # print(f"{contents=}, {text=}, {related_linkages=}")
973 if example_is_synonym or example_is_antonym:
974 link_linkages = []
975 for snode in contents:
976 if not isinstance(snode, WikiNode):
977 continue
978 if snode.kind == NodeKind.LINK:
979 link_linkages.append(
980 Linkage(
981 word=clean_node(wxr, None, snode.largs[0][0])
982 )
983 )
984 else:
985 for link in snode.find_child_recursively(NodeKind.LINK): 985 ↛ 986line 985 didn't jump to line 986 because the loop on line 985 never started
986 link_linkages.append(
987 Linkage(word=clean_node(wxr, None, link))
988 )
990 # print("=====")
991 # print(f"{link_linkages=}")
993 if example_is_synonym:
994 return [], [], [], link_linkages + related_linkages, []
995 elif example_is_antonym: 995 ↛ 998line 995 didn't jump to line 998 because the condition on line 995 was always true
996 return [], [], [], [], link_linkages + related_linkages
998 if len(related_linkages) > 0:
999 # parent_sense.related.extend(bl_linkages)
1000 # related_linkages = []
1001 # if not text.strip():
1002 return [], [], related_linkages, [], []
1004 example_is_synonym = False
1005 example_is_antonym = False
1007 if not text.strip(): 1007 ↛ 1008line 1007 didn't jump to line 1008 because the condition on line 1007 was never true
1008 return [], [], [], [], []
1010 example = Example(text=text)
1011 # logger.debug(f"{wxr.wtp.title}/example\n{text}")
1012 if len(sublists) > 0:
1013 translation = clean_node(wxr, dummy_parent, sublists).strip(
1014 "#*: \n"
1015 )
1016 if translation != "": 1016 ↛ 1019line 1016 didn't jump to line 1019 because the condition on line 1016 was always true
1017 example.translation = translation
1019 for k, v in dummy_parent.items(): 1019 ↛ 1020line 1019 didn't jump to line 1020 because the loop on line 1019 never started
1020 if k == "categories":
1021 parent_sense.categories.extend(v)
1022 dummy_parent = {}
1024 return [], [example], [], [], []
1026 found_gloss = parse_gloss(wxr, parent_sense, contents)
1028 for sl in sublists:
1029 if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST): 1029 ↛ 1031line 1029 didn't jump to line 1031 because the condition on line 1029 was never true
1030 # Should not happen
1031 wxr.wtp.error(
1032 f"Sublist is not NodeKind.LIST: {sublists=!r}",
1033 sortid="simple/pos/82",
1034 )
1035 continue
1036 (
1037 senses,
1038 examples,
1039 related,
1040 synonyms,
1041 antonyms,
1042 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), sl)
1043 ret_senses.extend(senses)
1044 ret_examples.extend(examples)
1045 ret_related.extend(related)
1046 ret_synonyms.extend(synonyms)
1047 ret_antonyms.extend(antonyms)
1048 if len(ret_senses) > 0:
1049 # the recursion returned actual senses from below, so we will
1050 # ignore everything else (incl. any example data that might have
1051 # been given to parent_sense) and return that instead.
1052 # XXX if this becomes relevant, add the example data to a returned
1053 # subsense instead?
1054 # if any(
1055 # isinstance(r, Sense) and r.tags == ["no-gloss"] for r in ret
1056 # ):
1057 # print(f"{ret=}")
1058 return (
1059 combine_senses_with_identical_glosses(ret_senses),
1060 [],
1061 [],
1062 [],
1063 [],
1064 )
1066 # If nothing came from below, then this.
1067 if found_gloss is True or "no-gloss" in parent_sense.tags: 1067 ↛ 1075line 1067 didn't jump to line 1075 because the condition on line 1067 was always true
1068 parent_sense.examples.extend(ret_examples)
1069 parent_sense.related.extend(ret_related)
1070 parent_sense.synonyms.extend(ret_synonyms)
1071 parent_sense.antonyms.extend(ret_antonyms)
1073 return [parent_sense], [], [], [], []
1075 return [], ret_examples, ret_related, ret_synonyms, ret_antonyms
def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Walk a list subtree and collect fully-built Sense objects.

    Delegates the actual recursion to recurse_glosses1(); any example or
    linkage data still loose at this level (instead of attached to a
    sense further down) indicates a parsing problem and is reported.
    """
    senses, examples, related, synonyms, antonyms = recurse_glosses1(
        wxr, Sense(), node
    )
    if examples or related or synonyms or antonyms:
        wxr.wtp.error(
            "NOT Sense has bubbled to recurse_glosses: "
            f"{examples=}, {related=}, {synonyms=}, {antonyms=}",
            sortid="pos/glosses/966",
        )

    # Normalize raw tags on every sense before handing them back.
    collected: list[Sense] = []
    for sense in senses:
        convert_tags_in_sense(sense)
        collected.append(sense)
    return collected
def split_nodes_to_lines(
    nodes: list[WikiNode | str],
) -> Iterator[list[WikiNode | str]]:
    """Take a list of nodes and split up the list into lines.
    This could be done by using node_to_wikitext() to reverse the parsing,
    and then you could parse the individual lines after splitting the text,
    but it seems unnecessary in the context of Greek Wiktionary PoS sections.

    Yields one list of nodes/strings per logical source line.  LIST nodes
    are always yielded alone as a line of their own; a few layout-only
    templates are dropped entirely.
    """
    parts: list[WikiNode | str] = []
    for node in nodes:
        if isinstance(node, WikiNode):
            # Lists are returned as whole, they're their own line
            if node.kind == NodeKind.LIST:
                # Flush whatever was accumulated before the list first.
                if len(parts) > 0:
                    yield parts
                    parts = []
                yield [node]
                continue
            if isinstance(node, TemplateNode) and node.template_name in (
                # Ignore specific templates, like {{((}} that bookends a column.
                "((",
                "))",
                "clear",
                "κλείδα-ελλ",
            ):
                continue
            parts.append(node)
        else:
            if "\n" in node:
                # Every interior newline terminates the current line.
                # NOTE: a blank interior line still yields the (possibly
                # empty) accumulated `parts`.
                split_string = node.splitlines()
                for spl in split_string[:-1]:
                    if spl:
                        parts.append(spl)
                    yield parts
                    parts = []
                # special handling for final newline; splitlines ignores it
                if node.endswith("\n"):
                    if split_string[-1]:
                        parts.append(split_string[-1])
                    yield parts
                    parts = []
                elif split_string[-1]:
                    # Trailing text with no newline: keep accumulating.
                    parts.append(split_string[-1])
            elif node:
                parts.append(node)

    # yield final parts
    if len(parts) > 0:
        yield parts
1157BOLD_RE = re.compile(r"(__/?[BI]__|, |\. )")
1160def extract_forms_and_tags(tagged_text: str) -> tuple[list[str], list[str]]:
1161 forms: list[str] = []
1162 tags: list[str] = []
1164 # print(f"{tagged_text=}")
1165 # inside_italics = False
1166 inside_bold = False
1168 for i, t in enumerate(BOLD_RE.split(tagged_text)):
1169 t = t.strip()
1170 # print(f"{i}: {t=}")
1171 if not t:
1172 continue
1174 if i % 2 == 0:
1175 # Text between splitters
1176 if inside_bold is True: 1176 ↛ 1177line 1176 didn't jump to line 1177 because the condition on line 1176 was never true
1177 forms.append(t)
1178 continue
1179 # Add everything else to raw_tags
1180 # if inside_italics is True:
1181 # tags.append(t)
1182 # continue
1183 # ". " and ", " just split. They're stripped to "." and "," if
1184 # this needs to be modified later.
1185 tags.append(t)
1186 continue
1187 match t:
1188 case "__B__": 1188 ↛ 1189line 1188 didn't jump to line 1189 because the pattern on line 1188 never matched
1189 inside_bold = True
1190 case "__/B__": 1190 ↛ 1191line 1190 didn't jump to line 1191 because the pattern on line 1190 never matched
1191 inside_bold = False
1192 # case "__I__":
1193 # inside_italics = True
1194 # case "__/I__":
1195 # inside_italics = False
1197 return forms, tags
META_RE = re.compile(r"__/?[ILEB]__")


def block_has_non_greek_text(text: str) -> bool:
    """Return True if any word in `text` appears to be non-Greek.

    After stripping `__B__`-style meta markers, only the first
    alphabetic character of each whitespace-separated token is
    inspected: if its Unicode name does not start with "GREEK", the
    whole block is treated as containing non-Greek text.
    """
    stripped = META_RE.sub("", text)
    for token in stripped.split():
        first_alpha = next((ch for ch in token if ch.isalpha()), None)
        if first_alpha is None:
            # Token has no letters at all (digits, punctuation); skip it.
            continue
        if not unicode_name(first_alpha).startswith("GREEK"):
            return True
    return False
def combine_senses_with_identical_glosses(
    orig_senses: list[Sense],
) -> list[Sense]:
    """Merge senses whose gloss tuples are identical.

    Senses are bucketed by their gloss lists (in original order); when a
    bucket holds more than one sense, the later ones are merged into the
    first.  If every gloss list is unique, the original list object is
    returned unchanged.
    """
    buckets: dict[tuple[str, ...], list[Sense]] = {}
    duplicates_seen = False

    for sense in orig_senses:
        bucket = buckets.setdefault(tuple(sense.glosses), [])
        if bucket:
            duplicates_seen = True
        bucket.append(sense)

    if not duplicates_seen:
        return orig_senses

    merged: list[Sense] = []
    for group in buckets.values():
        first, *rest = group
        for other in rest:
            first.merge(other)
        merged.append(first)

    return merged