Coverage for src/wiktextract/extractor/el/pos.py: 80% (452 statements)
« prev ^ index » next — coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2from collections.abc import Iterator
3from functools import partial
4from typing import TypeAlias
5from unicodedata import name as unicode_name
7from wikitextprocessor import (
8 HTMLNode,
9 NodeKind,
10 TemplateArgs,
11 TemplateNode,
12 WikiNode,
13)
14from wikitextprocessor.parser import LEVEL_KIND_FLAGS
16from wiktextract import WiktextractContext
17from wiktextract.page import clean_node
18from wiktextract.wxr_logging import logger
20from .head import parse_head
21from .linkages import process_linkage_section
22from .models import Example, FormOf, Linkage, Sense, TemplateData, WordEntry
23from .parse_utils import (
24 GREEK_LANGCODES,
25 Heading,
26 parse_lower_heading,
27 remove_duplicate_forms,
28)
29from .section_titles import POS_HEADINGS
30from .table import parse_table, process_inflection_section
31from .tags_utils import convert_tags_in_sense
32from .text_utils import (
33 ENDING_NUMBER_RE,
34 normalized_int,
35)
36from .translations import process_translations
38# from wiktextract.wxr_logging import logger
def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    prev_data: WordEntry | None,  # data from the last entry in this language
    pos: str,  # the internal POS name: the "noun" in "Noun 2"
    title: str,
    pos_tags: list[str],
    pos_num: int = -1,  # the "2" in "Noun 2"
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'. `data` provides basic
    data common with other POS sections, like pronunciation or etymology.

    Fills in `data.senses`, `data.forms`, head-template info and the data
    from sub-sections (translations, inflections, linkages), then returns
    `data`.
    """
    data.pos = pos  # the internal/translated name for the POS
    data.pos_num = pos_num  # Greek Wiktionary uses "Noun 1", "Noun 2" style headings.

    wxr.wtp.start_subsection(title)

    # Sound data associated with this POS might be coming from a shared
    # section, in which case we've tried to tag the sound data with its
    # pos name + number if possible. Filter out stuff that doesn't fit.
    # This is actually pretty common, but if the edition has proper
    # hierarchies for this, doing this step might be unnecessary.
    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos section(s),
            # so we add it to everything; this is basically the default
            # behavior.
            new_sounds.append(sound)
        else:
            for sound_pos in sound.poses:
                # Split a tag like "noun 2" into its name and number parts.
                m = ENDING_NUMBER_RE.search(sound_pos)
                if m is not None:
                    s_num = normalized_int(m.group(1).strip())
                    s_pos = sound_pos[: m.start()].strip().lower()
                else:
                    s_pos = sound_pos.strip().lower()
                    s_num = -1
                # Normalize the heading text to the canonical POS name.
                sound_meta = POS_HEADINGS[s_pos]
                s_pos = sound_meta["pos"]
                if s_pos == data.pos and s_num == data.pos_num:
                    new_sounds.append(sound)
    data.sounds = new_sounds

    # Get child nodes *except* headings (= LEVEL).
    pos_contents = list(
        node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
        # include empty string only for debug printing?
    )

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # split_nodes_to_lines returns list items on their own 'line'.
    # BIG ASSUMPTION: Greek Wiktionary doesn't use templates that generate
    # multiline head text, so newlines in strings (plus list nodes) are
    # reliable line boundaries. Head sections look like:
    #   > pre-head templates that create side-tables (inflections)
    #   > formatting templates like {{clear}} to be ignored
    #   > the head template, last before the glosses list
    #   > the glosses list tree, where the head portion ends.
    node_lines = list(split_nodes_to_lines(pos_contents))

    # Find where the glosses lists start. Looking at the "rump" after the
    # glosses lists start, it's simplest just to pull all the list nodes
    # and handle them; anything after or inbetween (categories, extra
    # templates, tables and images) can be ignored.
    glosses_index = None
    glosses_lists = []
    for i, line in enumerate(node_lines):
        if (
            len(line) == 1
            and isinstance(line[0], WikiNode)
            and line[0].kind == NodeKind.LIST
            and (line[0].sarg != ":")
        ):
            if glosses_index is None:
                glosses_index = i
            glosses_lists.append(line[0])

    if glosses_index is None:
        # If nothing was found, accept ":"-lists as gloss lists too.
        for i, line in enumerate(node_lines):
            if (
                len(line) == 1
                and isinstance(line[0], WikiNode)
                and line[0].kind == NodeKind.LIST
            ):
                if glosses_index is None:
                    glosses_index = i
                glosses_lists.append(line[0])

    if glosses_index is None:
        # Could not find any glosses.
        wxr.wtp.warning("Missing glosses", sortid="pos/20250121")
        data.tags.append("no-gloss")

    # Side data collected by bold_node_handler_fn while rendering the head.
    template_data: list[TemplateData] = []
    # NOTE(review): category_data is collected by the handler below but not
    # read anywhere in this function — confirm whether it is still needed.
    category_data: list[str] = []
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in bold_node_handler_fn
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not templates inside templates.
    template_depth = 0
    top_template_name: str | None = None

    def bold_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | None:
        """Insert special markers `__*S__` and `__*E__` around bold nodes so
        that the strings can later be split into "head-word" and "tag-words"
        parts. Collect incidental stuff, like side-tables, that are often
        put around the head."""
        assert isinstance(node, WikiNode)
        kind = node.kind
        nonlocal template_depth
        nonlocal top_template_name
        if kind == NodeKind.BOLD or (
            isinstance(node, HTMLNode)
            and (
                node.tag == "span"
                and "style" in node.attrs
                and (
                    "bold" in node.attrs["style"]
                    # Special handling for output for stuff in arabic script
                    or node.attrs["style"] == "color:black; font-size:200%;"
                )
                or node.tag == "b"
                or node.tag == "strong"
            )
        ):
            # These are word forms almost always
            return ["__B__", *node.children, "__/B__"]
        elif kind == NodeKind.ITALIC or (
            isinstance(node, HTMLNode)
            and (
                node.tag == "span"
                and "style" in node.attrs
                and ("italic" in node.attrs["style"])
                or node.tag == "i"
                or node.tag == "em"
            )
        ):
            # These are almost always tag words; often 'kai' isn't
            # italicized, for example.
            return ["__I__", *node.children, "__/I__"]
        elif isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # templates are handled with bold_node_handler_fn.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                template_data.append(
                    TemplateData(
                        name=node.template_name,
                        args={
                            str(k): clean_node(wxr, None, v)
                            for k, v in node.template_parameters.items()
                        },
                        expansion=expanded,
                    )
                )
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=bold_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret
        elif kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                return None
            if node.largs[0][0].startswith("Κατηγορία:"):
                # Plain category link: record it, emit nothing.
                category_data.append(node.largs[0][0][len("Κατηγορία:") :])
                return [""]
            # Special case for meta-links like Πρότυπο:ετ that generate
            # both a category link and :category link that is actually
            # displayed as a link, but for our purposes we want to ignore
            # that it is a link; it's a tag.
            if node.largs[0][0].startswith(":Κατηγορία:"):
                return [
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                        node_handler_fn=bold_node_handler_fn,
                    )
                    # output the "visible" half of the link.
                ]
            if node.largs[0][0].startswith("Αρχείο:"):
                # Ignore images and files.
                return [""]
            # Often forms are 'formatted' with links, so let's mark these
            # too.
            return [
                "__L__",
                wxr.wtp.node_to_text(
                    node.largs[1:2] or node.largs[0],
                    node_handler_fn=bold_node_handler_fn,
                ),
                # output the "visible" half of the link.
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]
        elif kind in {
            NodeKind.TABLE,
        }:
            # Side tables (e.g. floating inflection tables) are collected
            # together with the template that generated them, for later
            # parsing with parse_table().
            nonlocal table_nodes
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # Get the head line.
    # The head *should* be immediately before the glosses, so walk the
    # pre-gloss lines backwards; non-head lines (side tables, {{clear}})
    # are still rendered so their tables get collected.
    found_head = False

    for line in reversed(node_lines[:glosses_index]):
        # NOTE(review): template_data is reset on every iteration, so after
        # the loop it holds the templates of the *last processed* (i.e.
        # earliest) non-empty line, not necessarily the head line — confirm
        # this is intended before relying on head_templates.
        template_data = []
        template_depth = 0
        stripped = (
            wxr.wtp.node_to_text(line, node_handler_fn=bold_node_handler_fn)
            .removeprefix(":")
            .strip()
        )
        if not stripped:
            continue
        if not found_head and parse_head(wxr, data, stripped):
            found_head = True
    if not found_head:
        # There are a bunch of Greek Wiktionary articles with POS sections
        # without heads, but they seem to always follow ones with heads;
        # in this case, the result is just not including any `forms` field
        # for these (or copying the previous one).
        if prev_data is None:
            wxr.wtp.warning(
                f"Part of speech missing head: {wxr.wtp.title}",
                sortid="pos/460/20250104",
            )
        else:
            # No head found, copy previous (in this language)
            data.forms = [
                form.model_copy(deep=True) for form in prev_data.forms
            ]

    if len(template_data) > 0:
        data.head_templates = template_data

    if len(table_nodes) > 0:
        for template_name, table_node in table_nodes:
            parse_table(
                wxr,
                table_node,
                data,
                data.lang_code in GREEK_LANGCODES,
                template_name=template_name or "",
            )

    data.forms = remove_duplicate_forms(wxr, data.forms)

    # Head-line quirks seen in the wild (kept as reference):
    # * Images/files ([[Αρχείο:...]]) and the column template {{((}} are
    #   ignored.
    # * free: {{en-adj-r}} creates a floating inflection table, followed by
    #   the usual head template.
    # * τηλεομοιοτυπία: '''{{PAGENAME}}''' {{θ}} — {{θ}} is basically
    #   a gender tag like {{f|...}}.
    # * θηλυκός: '''{{PAGENAME}}, -ή''' ''και'' '''-ιά, -ό''' — alternate
    #   endings with no indication of what they mean.
    # * καφέ: '''{{PAGENAME}}''' {{ακλ|επίθ}} — ακλ(ιτος) = uninflected.
    # * Δημόκριτος: head followed by {{clear}}, which is just formatting
    #   to move the line down past floating margins.

    ### Glosses after head ###
    got_senses = False
    for lst in glosses_lists:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses.
        senses = recurse_glosses(wxr, lst, data)
        if len(senses) > 0:
            got_senses = True
            data.senses.extend(senses)

    if not got_senses and len(glosses_lists) > 0:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    # Handle sub-sections under this POS: translations, inflection tables
    # and linkage sections (related/synonyms/antonyms/transliterations).
    pos_sublevels = list(
        node.find_child(LEVEL_KIND_FLAGS)
    )

    for sl in pos_sublevels:
        subtitle = clean_node(wxr, None, sl.largs).lower().strip()

        # NOTE(review): `type` shadows the builtin and `pos`/`tags` shadow
        # this function's parameters; consider renaming.
        type, pos, heading_name, tags, num, ok = parse_lower_heading(
            wxr, subtitle
        )

        if type == Heading.Translations:
            process_translations(wxr, data, sl)
        elif type == Heading.Infl:
            process_inflection_section(wxr, data, sl)
        elif type in (
            Heading.Related,
            Heading.Synonyms,
            Heading.Antonyms,
            Heading.Transliterations,
        ):
            process_linkage_section(wxr, data, sl, type)

    return data
# One or more "(...)" groups (with surrounding whitespace) at the very start
# of a gloss line; Greek Wiktionary often uses these for template-less tags.
PARENS_BEFORE_RE = re.compile(r"\s*(\([^()]+\)\s*)+")
# Captures the contents of each individual "(...)" group.
ITER_PARENS_RE = re.compile(r"\(([^()]+)\)")
def bold_node_fn(
    node: WikiNode,
) -> list[str | WikiNode] | None:
    """Handle italic and bold nodes in the parse tree specially.

    Wraps the node's children in `__I__`/`__/I__` or `__B__`/`__/B__`
    marker strings so later text processing can tell tag-words (italic)
    apart from word forms (bold). Returns None for every other node kind,
    letting the default rendering run.
    """
    markers = {
        NodeKind.ITALIC: ("__I__", "__/I__"),
        NodeKind.BOLD: ("__B__", "__/B__"),
    }
    pair = markers.get(node.kind)
    if pair is None:
        return None
    opener, closer = pair
    return [opener, *node.children, closer]
def extract_form_of_templates(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
    siblings: list[str | WikiNode],
    siblings_index: int,
) -> None:
    """Parse form_of data for nouns, adjectives and verbs.

    Dispatches on the template name:
    * κλ | generic | form_of
    * γρ | generic | form_of
    * πτώση/πτώσεις | nouns, adjectives etc. | form_of and tags
    * ρημ τύπος | verbs | form_of
    * μτχ | verbs | form_of

    References:
    https://el.wiktionary.org/wiki/Πρότυπο:κλ
    https://el.wiktionary.org/wiki/Module:άλλημορφή
    https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_κλιτικούς_τύπους
    https://el.wiktionary.org/wiki/Πρότυπο:ρημ_τύπος
    https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_μετοχές
    """
    t_name = t_node.template_name

    def basic(arg_key: int | str) -> None:
        # Shared path for templates whose lemma sits in one argument.
        return extract_form_of_templates_basic(
            wxr,
            parent_sense,
            siblings,
            siblings_index,
            t_name,
            t_node,
            extract_argument=arg_key,
        )

    # Generic inflected-form template.
    if t_name == "κλ":
        return basic(2)

    # Notes:
    # * All occurrences in wiktionary have at least one argument.
    # * Only handle cases where the second argument refers to a form
    #   (μορφ / μορφή / λόγια μορφή του, etc.) and ignore those mistakenly
    #   used as synonym templates.
    if t_name == "γρ" and 2 in t_node.template_parameters:
        second_arg_str = clean_node(
            wxr, None, t_node.template_parameters[2]
        )
        if "μορφ" in second_arg_str:
            return basic(1)

    # Nouns and adjectives: case/number templates.
    if "πτώσεις" in t_name or "πτώση" in t_name:
        return extract_form_of_templates_ptosi(wxr, parent_sense, t_node)

    # Verb form and participle templates.
    if t_name == "ρημ τύπος":
        return basic(2)
    if t_name.startswith("μτχ"):
        return basic(1)
def extract_form_of_templates_basic(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    siblings: list[str | WikiNode],
    sibling_index: int,
    t_name: str,
    t_node: TemplateNode,
    extract_argument: int | str,
) -> None:
    """Extract the lemma of a simple form-of template into parent_sense.

    The lemma is normally read from `extract_argument` of the template;
    when that argument is missing (e.g. the mtxpp template has no args),
    fall back to consuming the link nodes immediately following the
    template among its siblings.
    """
    t_args = t_node.template_parameters
    if extract_argument in t_args:
        lemma = clean_node(wxr, None, t_args[extract_argument]).strip()
    else:
        wxr.wtp.warning(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/570/20250517",
        )
        # Consume whitespace strings and link nodes after the template.
        trailing: list[str | WikiNode] = []
        for sib in siblings[sibling_index + 1 :]:
            blank = isinstance(sib, str) and sib.strip() == ""
            link = isinstance(sib, WikiNode) and sib.kind == NodeKind.LINK
            if not (blank or link):
                break
            trailing.append(sib)
        lemma = clean_node(wxr, None, trailing).strip()

    if not lemma:
        wxr.wtp.warning(
            "Lemma extract from form-of template was empty or whitespace:"
            f"{t_name}, {t_args=}, {lemma=}",
            sortid="pos/609/20250925",
        )
        return
    parent_sense.form_of.append(FormOf(word=lemma))
def extract_form_of_templates_ptosi(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
) -> None:
    """Parse form_of for nouns and adjectives.

    Supports:
    * [gender του] πτώση-πτώσεις templates

    Notes:
    * πτώση has exactly one case, πτώσεις has at least two cases.

    The template name encodes:
    * optional gender prefix before "-" (e.g. "ουδ του-")
    * case(s) as uppercase letters (Ο/Α/Γ/Κ)
    * number as "εν" (singular) or "πλ" (plural)

    Examples:
    * {{πτώσηΑεν|κόρφος}} > accusative | singular
    * {{πτώσειςΟΚπλ|κόρφος}} > nominative, vocative | plural
    """
    t_name = t_node.template_name
    inflection_t_names = ("πτώσεις", "πτώση")
    tags: list[str] = []

    # Parse and consume gender if any.
    if "-" in t_name:
        # Cf. {{ουδ του-πτώσειςΟΑΚεν|καλός|grc}}
        # Use partition instead of split: a malformed name with more than
        # one "-" previously raised an uncaught ValueError on unpacking.
        gender, _, inflection = t_name.partition("-")
        GENDER_INFLECTION_MAP = {
            "θηλ": "feminine",
            "αρσ": "masculine",
            "ουδ": "neuter",
        }
        gender_tag = GENDER_INFLECTION_MAP.get(gender[:3])
        if gender_tag is None:
            # Bad template name.
            return
        tags.append(gender_tag)
    else:
        inflection = t_name

    # Remove πτώση-πτώσεις prefix.
    for prefix in inflection_t_names:
        if inflection.startswith(prefix):
            inflection = inflection[len(prefix) :]
            break

    PTOSI_INFLECTION_MAP = {
        "Ο": "nominative",
        "Α": "accusative",
        "Γ": "genitive",
        "Κ": "vocative",
    }

    try:
        # Lowercase letters spell the number, uppercase letters the cases.
        lowercase = "".join(ch for ch in inflection if ch.islower())
        number = {"εν": "singular", "πλ": "plural"}[lowercase]
        cases = [
            PTOSI_INFLECTION_MAP[ch] for ch in inflection if not ch.islower()
        ]
    except KeyError:
        # Bad template name.
        return

    # Previously: tags.extend([elt for elt in cases + [number]]) — a
    # pointless list copy; extend/append directly instead.
    tags.extend(cases)
    tags.append(number)

    t_args = t_node.template_parameters

    if 1 not in t_args:
        wxr.wtp.warning(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/620/20250416",
        )
        return

    lemma = clean_node(wxr, None, t_args[1])
    parent_sense.form_of.append(FormOf(word=lemma))
    tags.sort()  # For the tests, but also good practice
    parent_sense.tags.extend(tags)
def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Take what is preferably a line of text and extract tags and a gloss
    from it. The data is inserted into parent_sense, and for recursion
    purposes we return a boolean that tells whether there was any gloss
    text in a lower node."""
    if len(contents) == 0:
        return False

    # Pull form-of data out of any templates on the line first.
    for i, t_node in enumerate(contents):
        if isinstance(t_node, TemplateNode):
            extract_form_of_templates(wxr, parent_sense, t_node, contents, i)

    # NOTE(review): template_tags is never appended to anywhere in this
    # function, so the extend below is currently dead — confirm whether
    # this is leftover scaffolding.
    template_tags: list[str] = []

    bl_linkages: list[Linkage] = []
    no_gloss_but_keep_anyway = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        # Capture {{βλ}} ("see also") templates as related-word linkages
        # and suppress their expansion in the gloss text.
        nonlocal bl_linkages
        if name == "βλ":
            for k, v in ht.items():
                if isinstance(k, int):
                    bl_linkages.append(Linkage(word=clean_node(wxr, None, v)))
            return ""
        return None

    # The rest of the text.
    text = clean_node(
        wxr,
        parent_sense,
        contents,
        template_fn=bl_template_handler_fn,
        node_handler_fn=bold_node_fn,
    )

    if len(bl_linkages) > 0:
        parent_sense.related.extend(bl_linkages)
        # A line consisting only of {{βλ}} links still carries data.
        no_gloss_but_keep_anyway = True

    if not text.strip():
        if len(bl_linkages) <= 0:
            return False

    # Greek Wiktionary uses a lot of template-less tags: leading "(...)"
    # groups before the actual gloss text.
    if parens_n := PARENS_BEFORE_RE.match(text):
        blocks = ITER_PARENS_RE.findall(parens_n.group(0))
        kept_blocks: list[str] = []
        forms: list[str] = []
        raw_tag_texts: list[str] = []
        for block in blocks:
            if block_has_non_greek_text(block):
                # Keep parentheses with non-greek text with gloss text.
                kept_blocks.extend(("(", block, ") "))
                continue
            nforms, nraw_tag_texts = extract_forms_and_tags(block)
            forms.extend(nforms)
            raw_tag_texts.extend(nraw_tag_texts)
        if forms:
            parent_sense.related.extend(Linkage(word=form) for form in forms)
        parent_sense.raw_tags.extend(raw_tag_texts)
        # Re-append the gloss text that followed the parentheses.
        kept_blocks.append(text[parens_n.end() :])
        text = "".join(kept_blocks)

    # Strip the bold/italic markers inserted by bold_node_fn.
    text = re.sub(r"__/?[IB]__", "", text)

    if len(template_tags) > 0:
        parent_sense.raw_tags.extend(template_tags)

    if len(text) > 0:
        parent_sense.glosses.append(text)
        return True

    if no_gloss_but_keep_anyway:
        parent_sense.raw_tags.append("no-gloss")
        return True

    return False
# Aliases that make recurse_glosses1's five-way return tuple
# self-documenting; all three are plain Linkage at runtime.
Related: TypeAlias = Linkage
Synonym: TypeAlias = Linkage
Antonym: TypeAlias = Linkage
def recurse_glosses1(
    wxr: WiktextractContext,
    parent_sense: Sense,
    node: WikiNode,
) -> tuple[
    list[Sense],
    list[Example],
    list[Related],
    list[Synonym],
    list[Antonym],
]:
    """Helper function for recurse_glosses.

    Walks one LIST or LIST_ITEM node. Each recursion level gets a deep
    copy of parent_sense so sibling branches don't share state. Returns
    a five-tuple (senses, examples, related, synonyms, antonyms); only
    one "kind" of data is typically non-empty per call, and callers fold
    children's data into their own sense.
    """
    ret_senses: list[Sense] = []
    ret_examples: list[Example] = []
    ret_related: list[Related] = []
    ret_synonyms: list[Synonym] = []
    ret_antonyms: list[Antonym] = []
    found_gloss = False

    # Pydantic stuff doesn't play nice with Tatu's manual dict manipulation
    # functions, so we'll use a dummy dict here that we then check for
    # content and apply to `parent_sense`.
    dummy_parent: dict = {}

    related_linkages: list[Linkage] = []
    example_is_synonym = False
    example_is_antonym = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        nonlocal related_linkages
        nonlocal example_is_synonym
        nonlocal example_is_antonym
        # Sometimes the bl-templates point to synonyms or antonyms, instead
        # of just "related"; we save them, and if example_is_xxxnym is true,
        # we later return them as xxxnyms.
        if name == "βλ":
            for k, v in ht.items():
                if isinstance(k, int):
                    related_linkages.append(
                        Linkage(word=clean_node(wxr, None, v))
                    )
            return ""
        if name == "συνων":
            example_is_synonym = True
            return ""
        if name == "αντων":
            example_is_antonym = True
            return ""
        return None

    # List nodes contain only LIST_ITEM nodes, which may contain sub-LIST
    # nodes.
    if node.kind == NodeKind.LIST:
        list_ret: tuple[
            list[Sense],
            list[Example],
            list[Related],
            list[Synonym],
            list[Antonym],
        ] = ([], [], [], [], [])
        for child in node.children:
            if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM:
                # This should never happen
                wxr.wtp.error(
                    f"{child=} is direct child of NodeKind.LIST",
                    sortid="simple/pos/44",
                )
                continue
            (
                senses,
                examples,
                related,
                synonyms,
                antonyms,
            ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child)
            list_ret[0].extend(senses)
            list_ret[1].extend(examples)
            list_ret[2].extend(related)
            list_ret[3].extend(synonyms)
            list_ret[4].extend(antonyms)
        return list_ret

    elif node.kind == NodeKind.LIST_ITEM:
        # Split at first LIST node found: content before it belongs to
        # this item, the rest are sublists (subsenses/examples).
        split_at = next(
            (
                i
                for i, c in enumerate(node.children)
                if isinstance(c, WikiNode) and c.kind == NodeKind.LIST
            ),
            len(node.children),
        )
        contents = node.children[:split_at]
        sublists = node.children[split_at:]

        # A LIST and LIST_ITEM `sarg` is basically the prefix of the line,
        # like `#` or `##:`: the token that appears at the very start of a
        # line that is used to parse the depth and structure of lists.
        # `#` Item 1
        # `##` Item 1.1
        # `##*` Example 1.1
        if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"):
            # This is either a quotation or example.
            text = clean_node(
                wxr, dummy_parent, contents, template_fn=bl_template_handler_fn
            ).strip("⮡ \n")

            if example_is_synonym or example_is_antonym:
                # The "example" is really a synonym/antonym line; collect
                # its links as linkages instead.
                link_linkages = []
                for snode in contents:
                    if not isinstance(snode, WikiNode):
                        continue
                    if snode.kind == NodeKind.LINK:
                        link_linkages.append(
                            Linkage(
                                word=clean_node(wxr, None, snode.largs[0][0])
                            )
                        )
                    else:
                        for link in snode.find_child_recursively(NodeKind.LINK):
                            link_linkages.append(
                                Linkage(word=clean_node(wxr, None, link))
                            )

                if example_is_synonym:
                    return [], [], [], link_linkages + related_linkages, []
                elif example_is_antonym:
                    return [], [], [], [], link_linkages + related_linkages

            if len(related_linkages) > 0:
                # Pure {{βλ}} line: return as related links.
                return [], [], related_linkages, [], []

            example_is_synonym = False
            example_is_antonym = False

            if not text.strip():
                return [], [], [], [], []

            example = Example(text=text)
            if len(sublists) > 0:
                # A sublist under an example is its translation.
                translation = clean_node(wxr, dummy_parent, sublists).strip(
                    "#*: \n"
                )
                if translation != "":
                    example.translation = translation

            # Propagate categories collected by clean_node into the sense.
            for k, v in dummy_parent.items():
                if k == "categories":
                    parent_sense.categories.extend(v)
            dummy_parent = {}

            return [], [example], [], [], []

        found_gloss = parse_gloss(wxr, parent_sense, contents)

        for sl in sublists:
            if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST):
                # Should not happen
                wxr.wtp.error(
                    f"Sublist is not NodeKind.LIST: {sublists=!r}",
                    sortid="simple/pos/82",
                )
                continue
            (
                senses,
                examples,
                related,
                synonyms,
                antonyms,
            ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), sl)
            ret_senses.extend(senses)
            ret_examples.extend(examples)
            ret_related.extend(related)
            ret_synonyms.extend(synonyms)
            ret_antonyms.extend(antonyms)

    if len(ret_senses) > 0:
        # the recursion returned actual senses from below, so we will
        # ignore everything else (incl. any example data that might have
        # been given to parent_sense) and return that instead.
        # XXX if this becomes relevant, add the example data to a returned
        # subsense instead?
        return (
            combine_senses_with_identical_glosses(ret_senses),
            [],
            [],
            [],
            [],
        )

    # If nothing came from below, then this.
    if found_gloss is True or "no-gloss" in parent_sense.raw_tags:
        parent_sense.examples.extend(ret_examples)
        parent_sense.related.extend(ret_related)
        parent_sense.synonyms.extend(ret_synonyms)
        parent_sense.antonyms.extend(ret_antonyms)

        return [parent_sense], [], [], [], []

    return [], ret_examples, ret_related, ret_synonyms, ret_antonyms
def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Recurse through WikiNodes to find glosses and sense-related data.

    Entry point around recurse_glosses1(): starts the recursion with a
    fresh base Sense, reports any non-sense data that incorrectly bubbled
    up to the top level, and normalizes tags on each returned sense.
    """
    senses, examples, related, synonyms, antonyms = recurse_glosses1(
        wxr, Sense(), node
    )

    # Examples and linkage data should have been attached to some sense
    # deeper in the recursion; anything surviving to this level is a bug.
    if examples or related or synonyms or antonyms:
        wxr.wtp.error(
            "NOT Sense has bubbled to recurse_glosses: "
            f"{examples=}, {related=}, {synonyms=}, {antonyms=}",
            sortid="pos/glosses/966",
        )

    for sense in senses:
        convert_tags_in_sense(sense)

    return list(senses)
def split_nodes_to_lines(
    nodes: list[WikiNode | str],
) -> Iterator[list[WikiNode | str]]:
    """Yield sublists of `nodes` grouped into logical source lines.

    This could be done by using node_to_wikitext() to reverse the parsing,
    and then you could parse the individual lines after splitting the text,
    but it seems unnecessary in the context of Greek Wiktionary PoS
    sections. List nodes are yielded as one-element lines of their own;
    newlines embedded in plain strings terminate the current line
    (blank lines yield empty lists); a few layout-only templates are
    dropped entirely.
    """
    # Templates that only affect page layout and carry no line content,
    # like {{((}}/{{))}} that bookend a column.
    ignored_templates = ("((", "))", "clear", "κλείδα-ελλ")
    line: list[WikiNode | str] = []

    for item in nodes:
        if isinstance(item, WikiNode):
            if item.kind == NodeKind.LIST:
                # A whole list node forms a line by itself.
                if line:
                    yield line
                    line = []
                yield [item]
            elif (
                isinstance(item, TemplateNode)
                and item.template_name in ignored_templates
            ):
                # Drop layout-only templates.
                pass
            else:
                line.append(item)
            continue

        # Plain string content: embedded newlines terminate lines.
        if "\n" not in item:
            if item:
                line.append(item)
            continue

        pieces = item.splitlines()
        # Every piece before the last was followed by a newline in the
        # original string, so each closes a line (possibly an empty one).
        for piece in pieces[:-1]:
            if piece:
                line.append(piece)
            yield line
            line = []
        if item.endswith("\n"):
            # splitlines() swallows a trailing newline, so the final
            # piece also closes a line in that case.
            if pieces[-1]:
                line.append(pieces[-1])
            yield line
            line = []
        elif pieces[-1]:
            line.append(pieces[-1])

    # Flush whatever remains on the final, unterminated line.
    if line:
        yield line
# Splits on the pseudo-markup tokens __B__/__/B__/__I__/__/I__ and on
# ", " / ". " separators; the single capturing group makes re.split()
# return the separators themselves at odd indices of the result.
BOLD_RE = re.compile(r"(__/?[BI]__|, |\. )")


def extract_forms_and_tags(tagged_text: str) -> tuple[list[str], list[str]]:
    """Split pseudo-markup text into bolded word forms and raw tags.

    Chunks enclosed in `__B__ ... __/B__` are collected as forms; every
    other non-empty chunk becomes a raw tag. The ", " and ". " separators
    only split the text and are themselves discarded (as are italic
    markers). Returns `(forms, tags)`.
    """
    forms: list[str] = []
    tags: list[str] = []

    bold_open = False
    for idx, chunk in enumerate(BOLD_RE.split(tagged_text)):
        chunk = chunk.strip()
        if not chunk:
            continue
        if idx % 2 == 1:
            # Separator captured by BOLD_RE: only the bold markers change
            # state; ", ", ". " and italic markers are simply dropped.
            if chunk == "__B__":
                bold_open = True
            elif chunk == "__/B__":
                bold_open = False
        elif bold_open:
            forms.append(chunk)
        else:
            # Everything outside bold markers is treated as a raw tag.
            tags.append(chunk)

    return forms, tags
# Matches the pseudo-markup tokens (__I__, __/I__, __L__, __E__, __B__ and
# their closing forms) so they can be stripped before text inspection.
META_RE = re.compile(r"__/?[ILEB]__")


def block_has_non_greek_text(text: str) -> bool:
    """Heuristically detect non-Greek text in a block.

    After stripping pseudo-markup, inspect only the first alphabetic
    character of each whitespace-separated word: if its Unicode name does
    not start with "GREEK", the block is considered non-Greek. Words with
    no alphabetic characters are ignored.
    """
    cleaned = META_RE.sub("", text)
    for word in cleaned.split():
        first_letter = next((ch for ch in word if ch.isalpha()), None)
        if first_letter is None:
            continue
        if not unicode_name(first_letter).startswith("GREEK"):
            return True
    return False
def combine_senses_with_identical_glosses(
    orig_senses: list[Sense],
) -> list[Sense]:
    """Merge senses whose gloss lists are identical.

    Senses sharing the same tuple of glosses are merged (via Sense.merge)
    into the first such sense; first-appearance order is preserved. When
    no duplicates exist, the input list is returned as-is.
    """
    buckets: dict[tuple[str, ...], list[Sense]] = {}
    duplicates_seen = False

    for sense in orig_senses:
        bucket = buckets.setdefault(tuple(sense.glosses), [])
        if bucket:
            # Another sense with exactly these glosses already exists.
            duplicates_seen = True
        bucket.append(sense)

    if not duplicates_seen:
        return orig_senses

    merged: list[Sense] = []
    for group in buckets.values():
        primary, *rest = group
        for other in rest:
            primary.merge(other)
        merged.append(primary)

    return merged