Coverage for src/wiktextract/extractor/el/pos.py: 77%
446 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import re
2from collections.abc import Iterator
3from typing import TypeAlias
4from unicodedata import name as unicode_name
6from wikitextprocessor import (
7 HTMLNode,
8 NodeKind,
9 TemplateArgs,
10 TemplateNode,
11 WikiNode,
12)
13from wikitextprocessor.parser import LEVEL_KIND_FLAGS
15from wiktextract import WiktextractContext
16from wiktextract.page import clean_node
17from wiktextract.wxr_logging import logger
19from .head import parse_head
20from .linkages import process_linkage_section
21from .models import Example, FormOf, Linkage, Sense, TemplateData, WordEntry
22from .parse_utils import (
23 GREEK_LANGCODES,
24 Heading,
25 parse_lower_heading,
26 remove_duplicate_forms,
27)
28from .section_titles import POS_HEADINGS
29from .table import parse_table, process_inflection_section
30from .tags_utils import convert_tags_in_sense
31from .text_utils import (
32 ENDING_NUMBER_RE,
33 normalized_int,
34)
35from .translations import process_translations
37# from wiktextract.wxr_logging import logger
40def process_pos(
41 wxr: WiktextractContext,
42 node: WikiNode,
43 data: WordEntry,
44 prev_data: WordEntry | None, # data from the last entry in this language
45 # the "noun" in "Noun 2"
46 pos: str,
47 title: str,
48 # the "2" in "Noun 2"
49 pos_tags: list[str],
50 pos_num: int = -1,
51) -> WordEntry | None:
52 """Process a part-of-speech section, like 'Noun'. `data` provides basic
53 data common with other POS sections, like pronunciation or etymology."""
55 # Metadata for different part-of-speech kinds.
56 # print(f"{pos_title=}, {pos_tags=}, {pos_num=}")
57 data.pos = pos # the internal/translated name for the POS
58 data.pos_num = pos_num # SEW uses "Noun 1", "Noun 2" style headings.
60 wxr.wtp.start_subsection(title)
62 # Sound data associated with this POS might be coming from a shared
63 # section, in which case we've tried to tag the sound data with its
64 # pos name + number if possible. Filter out stuff that doesn't fit.
65 # This is actually pretty common, but if the edition has proper hierarchies
66 # for this, doing this step might be unnecessary.
67 new_sounds = []
68 for sound in data.sounds: 68 ↛ 69line 68 didn't jump to line 69 because the loop on line 68 never started
69 if len(sound.poses) == 0:
70 # This sound data wasn't tagged with any specific pos section(s), so
71 # we add it to everything; this is basically the default behavior.
72 new_sounds.append(sound)
73 else:
74 for sound_pos in sound.poses:
75 m = ENDING_NUMBER_RE.search(sound_pos)
76 if m is not None:
77 s_num = normalized_int(m.group(1).strip())
78 s_pos = sound_pos[: m.start()].strip().lower()
79 else:
80 s_pos = sound_pos.strip().lower()
81 s_num = -1
82 sound_meta = POS_HEADINGS[s_pos]
83 s_pos = sound_meta["pos"]
84 if s_pos == data.pos and s_num == data.pos_num:
85 new_sounds.append(sound)
86 data.sounds = new_sounds
88 # Get child nodes *except* headings (= LEVEL).
89 pos_contents = list(
90 node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
91 # include empty string only for debug printing?
92 )
94 if len(pos_contents) == 0 or ( 94 ↛ 101line 94 didn't jump to line 101 because the condition on line 94 was never true
95 len(pos_contents) == 1
96 and isinstance(pos_contents[0], str)
97 # Just a single newline or whitespace after heading.
98 and not pos_contents[0].strip()
99 ):
100 # Most probably a bad article.
101 wxr.wtp.error(
102 "No body for Part-of-speech section.", sortid="simple/pos/271"
103 )
104 data.senses.append(Sense(tags=["no-gloss"]))
105 return data
107 # split_nodes_to_lines returns lists items on their own 'line'
108 node_lines = list(split_nodes_to_lines(pos_contents))
110 glosses_index = None
111 glosses_lists = []
112 for i, line in enumerate(node_lines):
113 # Looking at the "rump" after glosses lists starts, it's simplest
114 # just to pull all the list nodes, and handle them. Anything after
115 # or inbetween (like categories, extra templates, tables and images)
116 # can be ignored.
117 if (
118 len(line) == 1
119 and isinstance(line[0], WikiNode)
120 and line[0].kind == NodeKind.LIST
121 and (line[0].sarg != ":")
122 ):
123 if glosses_index is None: 123 ↛ 125line 123 didn't jump to line 125 because the condition on line 123 was always true
124 glosses_index = i
125 glosses_lists.append(line[0])
127 if glosses_index is None:
128 # if nothing found, accept ":" nodes
129 for i, line in enumerate(node_lines):
130 if (
131 len(line) == 1
132 and isinstance(line[0], WikiNode)
133 and line[0].kind == NodeKind.LIST
134 ):
135 if glosses_index is None: 135 ↛ 137line 135 didn't jump to line 137 because the condition on line 135 was always true
136 glosses_index = i
137 glosses_lists.append(line[0])
139 if glosses_index is None: 139 ↛ 142line 139 didn't jump to line 142 because the condition on line 139 was never true
140 # Could not find any glosses.
141 # logger.info(f" //// {wxr.wtp.title}\n MISSING GLOSSES")
142 wxr.wtp.warning("Missing glosses", sortid="pos/20250121")
143 data.tags.append("no-gloss")
145 template_data: list[TemplateData] = []
146 category_data: list[str] = []
147 table_nodes: list[tuple[str | None, WikiNode]] = []
148 # template_depth is used as a nonlocal variable in bold_node_handler
149 # to gauge how deep inside a top-level template we are; we want to
150 # collect template data only for the top-level templates that are
151 # visible in the wikitext, not templates inside templates.
152 template_depth = 0
153 top_template_name: str | None = None
155 def bold_node_handler_fn(
156 node: WikiNode,
157 ) -> list[str | WikiNode] | None:
158 """Insert special markers `__*S__` and `__*E__` around bold nodes so
159 that the strings can later be split into "head-word" and "tag-words"
160 parts. Collect incidental stuff, like side-tables, that are often
161 put around the head."""
162 assert isinstance(node, WikiNode)
163 kind = node.kind
164 nonlocal template_depth
165 nonlocal top_template_name
166 if kind == NodeKind.BOLD or (
167 isinstance(node, HTMLNode)
168 and node.tag == "span"
169 and "style" in node.attrs
170 and (
171 "bold" in node.attrs["style"]
172 # Special handling for output for stuff in arabic script
173 or node.attrs["style"] == "color:black; font-size:200%;"
174 )
175 ):
176 # These are word forms almost always
177 return ["__B__", *node.children, "__/B__"]
178 elif kind == NodeKind.ITALIC:
179 # These are almost always tag words; often 'kai' isn't italicized,
180 # for example.
181 return ["__I__", *node.children, "__/I__"]
182 elif isinstance(node, TemplateNode):
183 # Recursively expand templates so that even nodes inside the
184 # the templates are handled with bold_node_handler.
185 # Argh. Don't use "node_to_text", that causes bad output...
186 expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
187 if template_depth == 0: 187 ↛ 201line 187 didn't jump to line 201 because the condition on line 187 was always true
188 # We are looking at a top-level template in the original
189 # wikitext.
190 template_data.append(
191 TemplateData(
192 name=node.template_name,
193 args={
194 str(k): clean_node(wxr, None, v)
195 for k, v in node.template_parameters.items()
196 },
197 expansion=expanded,
198 )
199 )
200 top_template_name = node.template_name
201 new_node = wxr.wtp.parse(expanded)
203 template_depth += 1
204 ret = wxr.wtp.node_to_text(
205 new_node, node_handler_fn=bold_node_handler_fn
206 )
207 template_depth -= 1
208 if template_depth == 0: 208 ↛ 210line 208 didn't jump to line 210 because the condition on line 208 was always true
209 top_template_name = None
210 return ret
211 elif kind == NodeKind.LINK:
212 if not isinstance(node.largs[0][0], str): 212 ↛ 213line 212 didn't jump to line 213 because the condition on line 212 was never true
213 return None
214 if node.largs[0][0].startswith("Κατηγορία:"): 214 ↛ 215line 214 didn't jump to line 215 because the condition on line 214 was never true
215 category_data.append(node.largs[0][0][len("Κατηγορία:") :])
216 return [""]
217 if node.largs[0][0].startswith("Αρχείο:"): 217 ↛ 218line 217 didn't jump to line 218 because the condition on line 217 was never true
218 return [""]
219 # Often forms are 'formatted' with links, so let's mark these
220 # too.
221 return [
222 "__L__",
223 # unpacking a list-comprehension, unpacking into a list
224 # seems to be more performant than adding lists together.
225 *(
226 wxr.wtp.node_to_text(
227 node.largs[1:2] or node.largs[0],
228 node_handler_fn=bold_node_handler_fn,
229 )
230 # output the "visible" half of the link.
231 ),
232 # XXX collect link data if it turns out to be important.
233 "__/L__",
234 ]
235 # print(f"{node.largs=}")
237 elif kind in { 237 ↛ 243line 237 didn't jump to line 243 because the condition on line 237 was never true
238 NodeKind.TABLE,
239 }:
240 # XXX Handle tables here
241 # template depth and top-level template name
242 nonlocal table_nodes
243 table_nodes.append((top_template_name, node))
244 return [""]
245 return None
247 # Get Head Line
248 # Head *should* be immediately before the glosses...
249 # print(node_lines[:glosses_index])
250 found_head = False
252 for line in reversed(node_lines[:glosses_index]):
253 template_data = []
254 template_depth = 0
255 stripped = (
256 wxr.wtp.node_to_text(line, node_handler_fn=bold_node_handler_fn)
257 .removeprefix(":")
258 .strip()
259 )
260 if not stripped:
261 continue
262 if not found_head and parse_head(wxr, data, stripped): 262 ↛ 252line 262 didn't jump to line 252 because the condition on line 262 was always true
263 # print(data)
264 found_head = True
265 if not found_head: 265 ↛ 271line 265 didn't jump to line 271 because the condition on line 265 was never true
266 # There are a bunch of Greek Wiktionary articles with POS sections
267 # without heads, but they seem to always follow ones with heads;
268 # in this case, the result is just not including any `forms` field
269 # for these (or copying the previous one).
271 if prev_data is None:
272 wxr.wtp.warning(
273 f"Part of speech missing head: {wxr.wtp.title}",
274 sortid="pos/460/20250104",
275 )
276 else:
277 # No head found, copy previous (in this language)
278 data.forms = [
279 form.model_copy(deep=True) for form in prev_data.forms
280 ]
282 if len(template_data) > 0: 282 ↛ 283line 282 didn't jump to line 283 because the condition on line 282 was never true
283 data.head_templates = template_data
284 # logger.info(
285 # f" //// {wxr.wtp.title}\n >>>"
286 # + "\n >>>".join(repr(td) for td in template_data)
287 # )
289 if len(table_nodes) > 0: 289 ↛ 290line 289 didn't jump to line 290 because the condition on line 289 was never true
290 for template_name, table_node in table_nodes:
291 # XXX template_name
292 parse_table(
293 wxr,
294 table_node,
295 data,
296 data.lang_code in GREEK_LANGCODES,
297 template_name=template_name or "",
298 )
300 data.forms = remove_duplicate_forms(wxr, data.forms)
302 # Ignore images and files
303 # 2025-01-17 13:10:10,035 INFO: //// Ηρόδοτος
304 # //// [[Αρχείο:Herodotus Massimo Inv124478.jpg|200px|thumb|[[προτομή]] του Ηροδότου]]
306 # Have to ignore {{(( specifically. Creates columns.
307 # 2025-01-17 13:10:11,059 INFO: //// κάνω
308 # //// {{((|width=97%}}
310 # logger.info(f"<<<<<<<<< {wxr.wtp.title}\n<<<" + "\n<<<".join(pparts))
311 # see: free -> {{en-verb-'free'}} creates a floating inflection table
312 # followed by the usual head template
314 # see: τηλεομοιοτυπία
315 # '''{{PAGENAME}}''' {{θ}}
316 # theta is basically {{f|...}}
318 # see: θηλυκός
319 # '''{{PAGENAME}}, -ή''' ''και'' '''-ιά, -ό'''
320 # pagename, -e and -ia, -o, no indication of what these mean
322 # Ιόνια νησιά
323 # >>>'''{{PAGENAME}}''' ''πληθυντικός του'' [[ιόνιος|Ιόνιο]] [[νησί]]
324 # plural of 'Ionian island'
326 # >>>>>>>>> free
327 # >>>{{en-adj-r}} # floating table
328 # >>>{{τ|en|{{PAGENAME}}}}, ''συγκριτικός'' '''freer''', ''υπερθετικός'' '''freest'''
329 # pretty consistent bolding and italics
331 # genus
332 # {{τ|la|{{PAGENAME}}}} {{ο}} ([[γενική]]: generis) (3ης [[κλίση]]ς)
334 # ουδέτερος
335 # >>>'''{{PAGENAME}} -η -ο'''
337 # καφέ
338 # >>>'''{{PAGENAME}}''' {{ακλ|επίθ}}
339 # aklitos, uninflected
341 # καφέ
342 # >>>[[Αρχείο:Cafe Museum - Vienna (5402363918).jpg|μικρογραφία|τραπέζι σε βιενέζικο '''καφέ''']]
343 # >>>'''{{PAGENAME}}''' {{ο}} {{ακλ|ουδ}}
344 # Ignore images
346 # κρόκος
347 # >>>{| align="right"
348 # >>>
349 # >>>|-
350 # >>>
351 # >>>|[[Αρχείο:Crocus sativus1.jpg|thumb|150px|Άνθη '''κρόκου''' (''Crocus sativus'').]]
352 # >>>
353 # >>>
354 # >>>|[[Αρχείο:Raw egg.jpg|thumb|200px|Ο '''κρόκος''' ενός αβγού.]]
355 # >>>
356 # >>>
357 # >>>|}
358 # >>>
359 # >>>'''{{PAGENAME}}''' {{α}}
361 # p
362 # >>>'''{{PAGENAME}}''' ''πεζό'' (''κεφαλαίο:'' '''{{l|P|la}}''')
363 # lowercase, uppercase
365 # Δημόκριτος
366 # >>>'''{{PAGENAME}}'''
367 # >>># {{όνομα||α}}
368 # >>>{{clear}}
369 # Clear is just formatting to move the line down where there are empty
370 # margins.
372 # BIG ASSUMPTION: let us assume that Greek Wiktionary doesn't use templates
373 # that generate multiline text that is part of head. That is, we can see
374 # each newline because they are in strings, and when something that does
375 # generate virtual newlines (list) pops up, that's when the head portion
376 # ends.
377 # Greek Wiktionary head sections look like this:
378 # > Pre-head templates that create side-tables, like inflections
379 # > Possible formatting templates like {{clear}} that should be ignored
380 # > Head template last before glosses list
381 # > Clear again...
382 # > Glosses list tree, where we can stop.
383 # We can create "lines" of these by looping over the items in pos_content
384 # and looking for newlines in strings, because that's where they mainly
385 # should be (except side-table templates). We handle earlier lines
386 # differently than the last line before the glosses list, which is the
387 # head.
389 # return None
391 # ======================
393 ### Glosses after head ###
394 # parts = []
395 got_senses = False
396 for lst in glosses_lists:
397 # Wiktionaries handle glosses the usual way: with numbered lists.
398 # Each list entry is a gloss, sometimes with subglosses, but with
399 # Simple English Wiktionary that seems rare.
400 # logger.debug(f"{lst}")
401 senses = recurse_glosses(wxr, lst, data)
402 if len(senses) > 0: 402 ↛ 396line 402 didn't jump to line 396 because the condition on line 402 was always true
403 got_senses = True
404 data.senses.extend(senses)
406 if not got_senses and len(glosses_lists) > 0: 406 ↛ 407line 406 didn't jump to line 407 because the condition on line 406 was never true
407 wxr.wtp.error(
408 "POS had a list, but the list did not return senses.",
409 sortid="simple/pos/313",
410 )
412 # If there is no list, clump everything into one gloss.
413 # if not len(glosses_lists > 0):
414 # sense = Sense()
415 # found_gloss = parse_gloss(wxr, sense, pos_contents[i:])
416 # if found_gloss is True or len(sense.raw_tags) > 0:
417 # convert_tags_in_sense(sense)
418 # if len(sense.glosses) == 0 and "no-gloss" not in sense.tags:
419 # sense.tags.append("no-gloss")
420 # data.senses.append(sense)
422 if len(data.senses) == 0: 422 ↛ 423line 422 didn't jump to line 423 because the condition on line 422 was never true
423 data.senses.append(Sense(tags=["no-gloss"]))
425 #####
426 #####
427 # TEMP DEBUG PRINTS
429 pos_sublevels = list(
430 node.find_child(LEVEL_KIND_FLAGS)
431 # include empty string only for debug printing?
432 )
434 for sl in pos_sublevels: 434 ↛ 435line 434 didn't jump to line 435 because the loop on line 434 never started
435 subtitle = clean_node(wxr, None, sl.largs[0]).lower().strip()
437 type, pos, heading_name, tags, num, ok = parse_lower_heading(
438 wxr, subtitle
439 )
441 if type == Heading.Translations:
442 process_translations(wxr, data, sl)
443 elif type == Heading.Infl:
444 process_inflection_section(wxr, data, sl)
445 elif type in (
446 Heading.Related,
447 Heading.Synonyms,
448 Heading.Antonyms,
449 Heading.Transliterations,
450 ):
451 process_linkage_section(wxr, data, sl, type)
452 # if type not in (
453 # Heading.Translations,
454 # Heading.Ignored,
455 # Heading.Infl,
456 # Heading.Related,
457 # Heading.Synonyms,
458 # Heading.Antonyms,
459 # Heading.Derived,
460 # # We're going to ignore homonyms because they're
461 # # only tangentially related, like anagrams
462 # Heading.Homonyms,
463 # ):
464 # # ...
465 # expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(sl))
466 # # text = clean_node(wxr, None, sl)
467 # logger.warning(
468 # f"""
469 # {wxr.wtp.title}: {type}, '{heading_name}', {ok=}
470 # {expanded}
472 # ###########################
473 # """
474 # )
476 #####
477 #####
478 return data
481PARENS_BEFORE_RE = re.compile(r"\s*(\([^()]+\)\s*)+")
482ITER_PARENS_RE = re.compile(r"\(([^()]+)\)")
def bold_node_fn(
    node: WikiNode,
) -> list[str | WikiNode] | None:
    """Mark up bold and italic nodes with pseudo-tag strings.

    Bold children are wrapped in `__B__`/`__/B__` and italic children in
    `__I__`/`__/I__` so that later text processing can tell head-words
    (bold) apart from tag-words (italic). Any other node kind returns
    None, leaving default node handling in place.
    """
    kind = node.kind
    if kind == NodeKind.BOLD:
        return ["__B__", *node.children, "__/B__"]
    if kind == NodeKind.ITALIC:
        return ["__I__", *node.children, "__/I__"]
    return None
def extract_form_of_templates(
    wxr: WiktextractContext, parent_sense: Sense, t_node: TemplateNode
) -> None:
    """Parse form_of for nouns, adjectives and verbs.

    Supports:
    * κλ | generic | form_of
    * πτώση/πτώσεις | nouns, adjectives etc. | form_of and tags
    * ρημ τύπος | verbs | form_of
    * μτχ | verbs | form_of

    * References:
    https://el.wiktionary.org/wiki/Πρότυπο:κλ
    https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_κλιτικούς_τύπους
    https://el.wiktionary.org/wiki/Πρότυπο:ρημ_τύπος
    https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_μετοχές
    """
    t_name = t_node.template_name
    t_args = t_node.template_parameters

    # Generic inflected-form template: lemma in positional arg 2.
    if t_name == "κλ":
        if 2 not in t_args:
            wxr.wtp.warning(
                "Form-of template does not have lemma data: "
                f"{t_name}, {t_args=}",
                sortid="pos/535/20250416",
            )
            return
        parent_sense.form_of.append(
            FormOf(word=clean_node(wxr, None, t_args[2]))
        )

    # Noun/adjective case templates; these also carry grammatical tags.
    if "πτώσεις" in t_name or "πτώση" in t_name:
        return extract_form_of_templates_ptosi(wxr, parent_sense, t_node)

    # Verb form template: lemma in positional arg 2.
    if t_name == "ρημ τύπος":
        if 2 not in t_args:
            wxr.wtp.warning(
                "Form-of template does not have lemma data: "
                f"{t_name}, {t_args=}",
                sortid="pos/535/20250416",
            )
            return
        parent_sense.form_of.append(
            FormOf(word=clean_node(wxr, None, t_args[2]))
        )

    # Participle templates (μτχ...): lemma in positional arg 1.
    if t_name.startswith("μτχ"):
        if 1 not in t_args:
            wxr.wtp.warning(
                "Form-of template does not have lemma data: "
                f"{t_name}, {t_args=}",
                sortid="pos/570/20250517",
            )
            return
        parent_sense.form_of.append(
            FormOf(word=clean_node(wxr, None, t_args[1]))
        )
def extract_form_of_templates_ptosi(
    wxr: WiktextractContext, parent_sense: Sense, t_node: TemplateNode
) -> None:
    """Parse form_of for nouns and adjectives.

    Supports:
    * [gender του] πτώση-πτώσεις templates

    Notes:
    * πτώση has exactly one case, πτώσεις as at least two cases
    """
    t_name = t_node.template_name
    inflection_t_names = ("πτώσεις", "πτώση")
    tags: list[str] = []

    # Parse and consume gender if any
    if "-" in t_name:
        # Cf. {{ουδ του-πτώσειςΟΑΚεν|καλός|grc}}
        # Split only on the first hyphen: a plain split() would raise an
        # unhandled ValueError on template names with several hyphens,
        # whereas other malformed names below are rejected silently.
        gender, inflection = t_name.split("-", 1)
        code = gender[:3]
        GENDER_INFLECTION_MAP = {
            "θηλ": "feminine",
            "αρσ": "masculine",
            "ουδ": "neuter",
        }
        try:
            gender_tag = GENDER_INFLECTION_MAP[code]
        except KeyError:
            # Bad template name.
            return
        tags.append(gender_tag)
    else:
        inflection = t_name

    # Remove πτώση-πτώσεις prefix
    for prefix in inflection_t_names:
        if inflection.startswith(prefix):
            inflection = inflection[len(prefix) :]
            break

    PTOSI_INFLECTION_MAP = {
        "Ο": "nominative",
        "Α": "accusative",
        "Γ": "genitive",
        "Κ": "vocative",
    }

    # The πτώση-πτώσεις templates contains:
    # * Case(s) (1 for πτώση, >1 for πτώσεις) in uppercase characters.
    # * Number in either "εν" (singular) or "πλ" (plural)
    #
    # Examples:
    # * {{πτώσηΑεν|κόρφος}} > accusative | singular
    # * {{πτώσειςΟΚπλ|κόρφος}} > nominative, vocative | plural
    try:
        lowercase = "".join(ch for ch in inflection if ch.islower())
        number = {"εν": "singular", "πλ": "plural"}[lowercase]
        uppercase = [ch for ch in inflection if not ch.islower()]
        cases = [PTOSI_INFLECTION_MAP[ch] for ch in uppercase]
    except KeyError:
        # Bad template name.
        return

    tags.extend(cases)
    tags.append(number)

    t_args = t_node.template_parameters

    if 1 not in t_args:
        wxr.wtp.warning(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/620/20250416",
        )
        return

    lemma = clean_node(wxr, None, t_args[1])
    form_of = FormOf(word=lemma)
    parent_sense.form_of.append(form_of)
    tags.sort()  # For the tests, but also good practice
    parent_sense.tags.extend(tags)
660def parse_gloss(
661 wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
662) -> bool:
663 """Take what is preferably a line of text and extract tags and a gloss from
664 it. The data is inserted into parent_sense, and for recursion purposes
665 we return a boolean that tells whether there was any gloss text in a
666 lower node."""
667 if len(contents) == 0: 667 ↛ 668line 667 didn't jump to line 668 because the condition on line 667 was never true
668 return False
670 for t_node in contents:
671 if isinstance(t_node, TemplateNode):
672 extract_form_of_templates(wxr, parent_sense, t_node)
674 template_tags: list[str] = []
676 bl_linkages: list[Linkage] = []
677 no_gloss_but_keep_anyway = False
679 def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
680 nonlocal bl_linkages
681 if name == "βλ":
682 for k, v in ht.items():
683 if isinstance(k, int):
684 bl_linkages.append(Linkage(word=clean_node(wxr, None, v)))
685 return ""
686 return None
688 # The rest of the text.
689 text = clean_node(
690 wxr,
691 parent_sense,
692 contents,
693 template_fn=bl_template_handler_fn,
694 node_handler_fn=bold_node_fn,
695 )
697 if len(bl_linkages) > 0:
698 parent_sense.related.extend(bl_linkages)
699 no_gloss_but_keep_anyway = True
701 if not text.strip():
702 if len(bl_linkages) <= 0: 702 ↛ 703line 702 didn't jump to line 703 because the condition on line 702 was never true
703 return False
705 # print(f" ============ {contents=}, {text=}")
707 # Greek Wiktionary uses a lot of template-less tags.
708 if parens_n := PARENS_BEFORE_RE.match(text):
709 blocks = ITER_PARENS_RE.findall(parens_n.group(0))
710 # print(f"{blocks=}")
711 kept_blocks: list[str] = []
712 forms: list[str] = []
713 raw_tag_texts: list[str] = []
714 for block in blocks:
715 if block_has_non_greek_text(block):
716 # Keep parentheses with non-greek text with gloss text)
717 kept_blocks.extend(("(", block, ") "))
718 continue
719 nforms, nraw_tag_texts = extract_forms_and_tags(block)
720 forms.extend(nforms)
721 raw_tag_texts.extend(nraw_tag_texts)
722 # print(f"{forms=}, {raw_tag_texts=}")
723 if forms: 723 ↛ 725line 723 didn't jump to line 725 because the condition on line 723 was never true
724 # print(f"{forms=}")
725 parent_sense.related.extend(Linkage(word=form) for form in forms)
726 parent_sense.raw_tags.extend(raw_tag_texts)
727 kept_blocks.append(text[parens_n.end() :])
728 text = "".join(kept_blocks)
730 text = re.sub(r"__/?[IB]__", "", text)
732 if len(template_tags) > 0: 732 ↛ 733line 732 didn't jump to line 733 because the condition on line 732 was never true
733 parent_sense.raw_tags.extend(template_tags)
735 if len(text) > 0:
736 parent_sense.glosses.append(text)
737 return True
739 if no_gloss_but_keep_anyway: 739 ↛ 743line 739 didn't jump to line 743 because the condition on line 739 was always true
740 parent_sense.raw_tags.append("no-gloss")
741 return True
743 return False
746Related: TypeAlias = Linkage
747Synonym: TypeAlias = Linkage
748Antonym: TypeAlias = Linkage
751def recurse_glosses1(
752 wxr: WiktextractContext,
753 parent_sense: Sense,
754 node: WikiNode,
755) -> tuple[
756 list[Sense],
757 list[Example],
758 list[Related],
759 list[Synonym],
760 list[Antonym],
761]:
762 """Helper function for recurse_glosses"""
763 # print(f"{node=}")
765 ret_senses: list[Sense] = []
766 ret_examples: list[Example] = []
767 ret_related: list[Related] = []
768 ret_synonyms: list[Synonym] = []
769 ret_antonyms: list[Antonym] = []
770 found_gloss = False
772 # Pydantic stuff doesn't play nice with Tatu's manual dict manipulation
773 # functions, so we'll use a dummy dict here that we then check for
774 # content and apply to `parent_sense`.
775 dummy_parent: dict = {}
777 related_linkages: list[Linkage] = []
778 example_is_synonym = False
779 example_is_antonym = False
781 def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
782 nonlocal related_linkages
783 nonlocal example_is_synonym
784 nonlocal example_is_antonym
785 # Sometimes the bl-templates point to synonyms or antonyms, instead
786 # of just "related"; we save them, and if example_is_xxxnym is true,
787 # we later return them as xxxnyms.
788 if name == "βλ":
789 for k, v in ht.items():
790 if isinstance(k, int):
791 related_linkages.append(
792 Linkage(word=clean_node(wxr, None, v))
793 )
794 return ""
795 if name == "συνων":
796 example_is_synonym = True
797 return ""
798 if name == "αντων":
799 example_is_antonym = True
800 return ""
801 return None
803 # List nodes contain only LIST_ITEM nodes, which may contain sub-LIST nodes.
804 if node.kind == NodeKind.LIST:
805 list_ret: tuple[
806 list[Sense],
807 list[Example],
808 list[Related],
809 list[Synonym],
810 list[Antonym],
811 ] = ([], [], [], [], [])
812 for child in node.children:
813 if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM: 813 ↛ 815line 813 didn't jump to line 815 because the condition on line 813 was never true
814 # This should never happen
815 wxr.wtp.error(
816 f"{child=} is direct child of NodeKind.LIST",
817 sortid="simple/pos/44",
818 )
819 continue
820 (
821 senses,
822 examples,
823 related,
824 synonyms,
825 antonyms,
826 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child)
827 list_ret[0].extend(senses)
828 list_ret[1].extend(examples)
829 list_ret[2].extend(related)
830 list_ret[3].extend(synonyms)
831 list_ret[4].extend(antonyms)
832 return list_ret
834 elif node.kind == NodeKind.LIST_ITEM: 834 ↛ 941line 834 didn't jump to line 941 because the condition on line 834 was always true
835 # Split at first LIST node found
836 split_at = next(
837 (
838 i
839 for i, c in enumerate(node.children)
840 if isinstance(c, WikiNode) and c.kind == NodeKind.LIST
841 ),
842 len(node.children),
843 )
844 contents = node.children[:split_at]
845 sublists = node.children[split_at:]
847 # A LIST and LIST_ITEM `sarg` is basically the prefix of the line, like
848 # `#` or `##:`: the token that appears at the very start of a line that
849 # is used to parse the depth and structure of lists.
850 # `#` Item 1
851 # `##` Item 1.1
852 # `##*` Example 1.1
853 if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"):
854 # This is either a quotation or example.
855 text = clean_node(
856 wxr, dummy_parent, contents, template_fn=bl_template_handler_fn
857 ).strip("⮡ \n")
859 # print(f"{contents=}, {text=}, {related_linkages=}")
862 if example_is_synonym or example_is_antonym:
863 link_linkages = []
864 for snode in contents:
865 if not isinstance(snode, WikiNode):
866 continue
867 if snode.kind == NodeKind.LINK:
868 link_linkages.append(
869 Linkage(
870 word=clean_node(wxr, None, snode.largs[0][0])
871 )
872 )
873 else:
874 for link in snode.find_child_recursively(NodeKind.LINK): 874 ↛ 875line 874 didn't jump to line 875 because the loop on line 874 never started
875 link_linkages.append(
876 Linkage(
877 word=clean_node(
878 wxr, None, snode.largs[0][0]
879 )
880 )
881 )
883 # print("=====")
884 # print(f"{link_linkages=}")
886 if example_is_synonym:
887 return [], [], [], link_linkages + related_linkages, []
888 elif example_is_antonym: 888 ↛ 891line 888 didn't jump to line 891 because the condition on line 888 was always true
889 return [], [], [], [], link_linkages + related_linkages
891 if len(related_linkages) > 0:
892 # parent_sense.related.extend(bl_linkages)
893 # related_linkages = []
894 # if not text.strip():
895 return [], [], related_linkages, [], []
897 example_is_synonym = False
898 example_is_antonym = False
900 if not text.strip(): 900 ↛ 901line 900 didn't jump to line 901 because the condition on line 900 was never true
901 return [], [], [], [], []
903 example = Example(text=text)
904 # logger.debug(f"{wxr.wtp.title}/example\n{text}")
905 if len(sublists) > 0:
906 translation = clean_node(wxr, dummy_parent, sublists).strip(
907 "#*: \n"
908 )
909 if translation != "": 909 ↛ 912line 909 didn't jump to line 912 because the condition on line 909 was always true
910 example.translation = translation
912 for k, v in dummy_parent.items(): 912 ↛ 913line 912 didn't jump to line 913 because the loop on line 912 never started
913 if k == "categories":
914 parent_sense.categories.extend(v)
915 dummy_parent = {}
917 return [], [example], [], [], []
919 found_gloss = parse_gloss(wxr, parent_sense, contents)
921 for sl in sublists:
922 if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST): 922 ↛ 924line 922 didn't jump to line 924 because the condition on line 922 was never true
923 # Should not happen
924 wxr.wtp.error(
925 f"Sublist is not NodeKind.LIST: {sublists=!r}",
926 sortid="simple/pos/82",
927 )
928 continue
929 (
930 senses,
931 examples,
932 related,
933 synonyms,
934 antonyms,
935 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), sl)
936 ret_senses.extend(senses)
937 ret_examples.extend(examples)
938 ret_related.extend(related)
939 ret_synonyms.extend(synonyms)
940 ret_antonyms.extend(antonyms)
941 if len(ret_senses) > 0:
942 # the recursion returned actual senses from below, so we will
943 # ignore everything else (incl. any example data that might have
944 # been given to parent_sense) and return that instead.
945 # XXX if this becomes relevant, add the example data to a returned
946 # subsense instead?
947 # if any(
948 # isinstance(r, Sense) and r.raw_tags == ["no-gloss"] for r in ret
949 # ):
950 # print(f"{ret=}")
951 return (
952 combine_senses_with_identical_glosses(ret_senses),
953 [],
954 [],
955 [],
956 [],
957 )
959 # If nothing came from below, then this.
960 if found_gloss is True or "no-gloss" in parent_sense.raw_tags: 960 ↛ 968line 960 didn't jump to line 968 because the condition on line 960 was always true
961 parent_sense.examples.extend(ret_examples)
962 parent_sense.related.extend(ret_related)
963 parent_sense.synonyms.extend(ret_synonyms)
964 parent_sense.antonyms.extend(ret_antonyms)
966 return [parent_sense], [], [], [], []
968 return [], ret_examples, ret_related, ret_synonyms, ret_antonyms
def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Recurse through WikiNodes to find glosses and sense-related data.

    Entry point that wraps recurse_glosses1(): starts the recursion with an
    empty base Sense, reports any non-sense data that leaked to the top
    level, and normalizes tags on each returned sense.
    """
    senses, examples, related, synonyms, antonyms = recurse_glosses1(
        wxr, Sense(), node
    )

    # Only Sense objects should reach the top of the recursion; examples,
    # related terms, synonyms and antonyms are supposed to be attached to a
    # sense somewhere further down. Anything left over here is a logic error.
    if examples or related or synonyms or antonyms:
        wxr.wtp.error(
            "NOT Sense has bubbled to recurse_glosses: "
            f"{examples=}, {related=}, {synonyms=}, {antonyms=}",
            sortid="pos/glosses/966",
        )

    for sense in senses:
        convert_tags_in_sense(sense)

    return senses
def split_nodes_to_lines(
    nodes: list[WikiNode | str],
) -> Iterator[list[WikiNode | str]]:
    """Take a list of nodes and split up the list into lines.
    This could be done by using node_to_wikitext() to reverse the parsing,
    and then you could parse the individual lines after splitting the text,
    but it seems unnecessary in the context of Greek Wiktionary PoS sections.

    Yields lists of nodes/strings, one list per logical line. A LIST node
    is always yielded alone as its own line. String nodes are split on
    newlines; note that splitting a string can yield an *empty* list for a
    blank line in the wikitext, whereas the final flush at the end of the
    function only yields a non-empty accumulator.
    """
    # Accumulator for the current (not yet newline-terminated) line.
    parts: list[WikiNode | str] = []
    for node in nodes:
        if isinstance(node, WikiNode):
            # Lists are returned as whole, they're their own line
            if node.kind == NodeKind.LIST:
                # Flush whatever preceded the list on this line first.
                if len(parts) > 0:
                    yield parts
                    parts = []
                yield [node]
                continue
            if isinstance(node, TemplateNode) and node.template_name in (
                # Ignore specific templates, like {{((}} that bookends a column.
                "((",
                "))",
                "clear",
                "κλείδα-ελλ",
            ):
                continue
            # Any other WikiNode belongs to the current line.
            parts.append(node)
        else:
            if "\n" in node:
                split_string = node.splitlines()
                # Every element except the last is terminated by a newline
                # inside `node`, so each one ends the current line. The
                # yield is unconditional: an empty segment (a blank line in
                # the source) yields whatever is in `parts`, possibly [].
                for spl in split_string[:-1]:
                    if spl:
                        parts.append(spl)
                    yield parts
                    parts = []
                # special handling for final newline; splitlines ignores it
                if node.endswith("\n"):
                    # The last segment is also newline-terminated: close
                    # the line here too.
                    if split_string[-1]:
                        parts.append(split_string[-1])
                    yield parts
                    parts = []
                elif split_string[-1]:
                    # No trailing newline: the last segment starts a new
                    # line that continues with the next node.
                    parts.append(split_string[-1])
            elif node:
                # Newline-free, non-empty string: part of the current line.
                parts.append(node)

    # yield final parts
    if len(parts) > 0:
        yield parts
# Splits on bold/italic pseudo-markup markers and on ", " / ". " separators.
# The single capturing group makes re.split() return the separators too, at
# the odd indices of the result.
BOLD_RE = re.compile(r"(__/?[BI]__|, |\. )")


def extract_forms_and_tags(tagged_text: str) -> tuple[list[str], list[str]]:
    """Split marker-tagged text into word forms and raw tag strings.

    Text enclosed in __B__ ... __/B__ markers is collected as a form;
    everything else between separators becomes a raw tag. The ", " and
    ". " separators (and italics markers) only split and are discarded.
    """
    forms: list[str] = []
    raw_tags: list[str] = []

    bold_open = False

    # Because BOLD_RE has one capturing group, even indices hold the text
    # between separators and odd indices hold the separators themselves.
    for index, chunk in enumerate(BOLD_RE.split(tagged_text)):
        chunk = chunk.strip()
        if not chunk:
            continue
        if index % 2 == 1:
            # Separator token: only bold markers change state; commas,
            # periods and italics markers are dropped.
            if chunk == "__B__":
                bold_open = True
            elif chunk == "__/B__":
                bold_open = False
            continue
        # Plain text between separators: bolded text is a form, the rest
        # goes to raw_tags for later interpretation.
        if bold_open:
            forms.append(chunk)
        else:
            raw_tags.append(chunk)

    return forms, raw_tags
# Matches the __B__/__I__/__L__/__E__ style pseudo-markup markers so they
# can be stripped before inspecting the actual text.
META_RE = re.compile(r"__/?[ILEB]__")


def block_has_non_greek_text(text: str) -> bool:
    """Return True if the text appears to contain non-Greek words.

    Heuristic: after stripping markup markers, inspect only the first
    alphabetic character of each whitespace-separated token; if its
    Unicode name does not start with "GREEK", the block is considered
    to contain non-Greek text. Tokens with no alphabetic characters
    (numbers, punctuation) are ignored.
    """
    cleaned = META_RE.sub("", text)
    for token in cleaned.split():
        first_letter = next((ch for ch in token if ch.isalpha()), None)
        if first_letter is None:
            continue
        if not unicode_name(first_letter).startswith("GREEK"):
            return True
    return False
def combine_senses_with_identical_glosses(
    orig_senses: list[Sense],
) -> list[Sense]:
    """Merge senses whose gloss lists are exactly identical.

    Senses are grouped by their gloss tuple; within each group, later
    duplicates are merged into the first-seen sense. If every gloss tuple
    is unique, the input list is returned unchanged. Order of first
    appearance is preserved.
    """
    # Group senses by their (hashable) tuple of glosses; dict insertion
    # order keeps the first-seen ordering for the output.
    buckets: dict[tuple[str, ...], list[Sense]] = {}
    for sense in orig_senses:
        buckets.setdefault(tuple(sense.glosses), []).append(sense)

    # Fast path: no duplicates found, hand back the original list.
    if all(len(group) == 1 for group in buckets.values()):
        return orig_senses

    merged: list[Sense] = []
    for group in buckets.values():
        primary, *duplicates = group
        for duplicate in duplicates:
            primary.merge(duplicate)
        merged.append(primary)

    return merged