Coverage for src/wiktextract/extractor/en/inflection.py: 87%
1549 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
1# Code for parsing inflection tables.
2#
3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org.
5import collections
6import copy
7import functools
8import html
9import re
10import unicodedata
11from typing import TYPE_CHECKING, Generator, Literal, Optional, Union
13from mediawiki_langcodes import code_to_name, name_to_code
14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode
16from ...clean import clean_value
17from ...datautils import data_append, freeze, split_at_comma_semi
18from ...tags import valid_tags
19from ...wxr_context import WiktextractContext
20from .form_descriptions import (
21 classify_desc,
22 decode_tags,
23 distw,
24 match_links_to_form,
25 parse_head_final_tags,
26)
27from .inflection_kludges import ka_decl_noun_template_cell
28from .inflectiondata import infl_map, infl_start_map, infl_start_re
29from .lang_specific_configs import get_lang_conf, lang_specific_tags
30from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS
31from .type_utils import FormData, WordData
# --debug-text-cell WORD
# Command-line parameter for debugging.  When parsing inflection tables,
# print out debug messages when encountering this text.
# None (the initial value) means no debug text has been configured.
debug_cell_text: Optional[str] = None


def set_debug_cell_text(text: str) -> None:
    """Store ``text`` in the module-level ``debug_cell_text`` variable."""
    global debug_cell_text
    debug_cell_text = text
# A list of alternative tag sets; each alternative is a tuple of tag strings.
TagSets = list[tuple[str, ...]]

# Column texts that are interpreted as an empty column.
# Most entries are dash-like characters that are easy to confuse visually;
# they are listed in code-point order.
IGNORED_COLVALUES = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    # FULLWIDTH HYPHEN-MINUS.  Written as an escape so that it cannot be
    # silently folded into a second copy of the ASCII "-" entry above by
    # editors or text-cleaning tools that normalise character width.
    "\uff0d",
    "/",
    "?",
    "not used",
    "not applicable",
}
# These tags are never inherited from above
# XXX merge with lang_specific
# (All of them are "infinitive-<suffix>" tags, so only the suffixes are
# spelled out.)
noinherit_tags = {
    "infinitive-" + suffix
    for suffix in ("i", "i-long", "ii", "iii", "iv", "v")
}
# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags.
# Every replacement is simply the subject tag prefixed with "object-", so
# the mapping is derived from the list of subject tags (insertion order is
# the order in which the tags are listed here).
object_concord_replacements = {
    tag: "object-" + tag
    for tag in (
        "first-person",
        "second-person",
        "third-person",
        "singular",
        "plural",
        "definite",
        "indefinite",
        *("class-{}".format(num) for num in range(1, 19)),
        "masculine",
        "feminine",
    )
}
# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: every space-separated tag in the values must be
# a known tag; offenders are reported on stdout.
for k, v in title_contains_global_map.items():
    if not all(t in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Title fragment such as "Declension of X" (only the first character of the
# following word is consumed).
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

# (?i) python regex extension, ignore case
table_hdr_ign_part_re = re.compile("(?i)(" + table_hdr_ign_part + ")")
# Matches either the ignorable header fragment or any key of
# title_contains_global_map, delimited by word boundaries or string ends.
title_contains_global_re = re.compile(
    r"(?i)(^|\b)("
    + table_hdr_ign_part
    + "|"
    + "|".join(map(re.escape, title_contains_global_map))
    + r")($|\b)"
)
# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
    "contracted": "contracted",
    "present": "present",
    "perfect": "perfect",
    "imperfect": "imperfect",
    "pluperfect": "pluperfect",
    "future": "future",
    "aorist": "aorist",
    "eastern armenian": "Eastern-Armenian",
    "western armenian": "Western-Armenian",
    "-al conjugation": "-al-conjugation",
    "-al negative conjugation": "-al-conjugation",
    "-il conjugation": "-il-conjugation",
    "-il negative conjugation": "-il-conjugation",
    "-el conjugation": "-el-conjugation",
    "-el negative conjugation": "-el-conjugation",
    "-ul conjugation": "-ul-conjugation",
    "-ul negative conjugation": "-ul-conjugation",
    "u-type": "u-type",
    "nominalized infinitive": "noun infinitive",
}
# Import-time sanity check: every space-separated tag in the values must be
# a known tag; offenders are reported on stdout.
for k, v in title_contains_wordtags_map.items():
    if not all(t in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
# Keys are tried longest first, so that e.g. "imperfective" is preferred over
# "imperfect" at the same position.  The reversed ascending sort (rather than
# a descending sort) is kept so that the order among equal-length keys, and
# hence the compiled pattern, is unchanged.
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)("
    + table_hdr_ign_part
    + "|"
    + "|".join(
        map(re.escape, sorted(title_contains_wordtags_map, key=len)[::-1])
    )
    + r")($|\b)"
)
# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "iterative": "iterative",
    "poetic": "poetic",
    "Attic": "Attic",
    "Epic": "Epic",
    "Aeolic": "Aeolic",
    "Arcadocypriot": "Arcadocypriot",
    "Old Attic": "Old-Attic",
    "Boeotian": "Boeotian",
    "Byzantine": "Byzantine",
    "Choral Doric": "Choral-Doric",
    "Doric": "Doric",
    "Elean": "Elean",
    "Epirote": "Epirote",
    "Ionic": "Ionic",
    "Koine": "Koine",
    "Cretan": "Cretan",
    "Corinthian": "Corinthian",
    "Laconian": "Laconian",
    "Later poetic": "Later-poetic-Ancient-Greek",
    "Lesbian": "Lesbian",
    "Locrian": "Locrian",
    "Lyric": "Lyric-Ancient-Greek",
    "Thessalian": "Thessalian",
    "Tragic": "Tragic-Ancient-Greek",
}
# Import-time sanity check: every space-separated tag in the values must be
# a known tag; offenders are reported on stdout.
for k, v in title_elements_map.items():
    if not all(t in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Starts of parenthesized title elements, mapped to the tags given to the
# form made from the rest of the element
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: every space-separated tag in the values must be
# a known tag; offenders are reported on stdout.
for k, v in title_elemstart_map.items():
    if not all(t in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Anchored at the start; the key must be followed by a single space.
title_elemstart_re = re.compile(
    "^(" + "|".join(map(re.escape, title_elemstart_map)) + ") "
)
# Regexp for cell starts that are likely definitions of reference symbols.
# See also nondef_re.
# Capture groups:
#   1: optional leading whitespace/bullet prefix
#   2: the whole marker alternation
#   3: run of "*" or of digits/superscript digits/△/† (must be followed by
#      group 4 or an upper-case letter)
#   4: terminator after group 3: one of "⁾", ")", ":" or whitespace
#   5: marker introduced by "^": run of "*" or one of △/†
#   6: a single superscript digit 1-9
#   7: a single superscript/modifier letter
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"
335class InflCell:
336 """Cell in an inflection table."""
338 __slots__ = (
339 "text",
340 "is_title",
341 "colspan",
342 "rowspan",
343 "target",
344 "links",
345 )
347 def __init__(
348 self,
349 text: str,
350 is_title: bool,
351 colspan: int,
352 rowspan: int,
353 target: str | None,
354 cell_links: list[tuple[str, str]] | None = None,
355 ) -> None:
356 assert isinstance(text, str)
357 assert is_title in (True, False)
358 assert isinstance(colspan, int) and colspan >= 1
359 assert isinstance(rowspan, int) and rowspan >= 1
360 assert target is None or isinstance(target, str)
361 self.text = text.strip()
362 self.is_title = text and is_title
363 self.colspan = colspan
364 self.rowspan = rowspan
365 self.target = target
366 self.links = cell_links
368 def __str__(self) -> str:
369 v = "{}/{}/{}/{!r}".format(
370 self.text, self.is_title, self.colspan, self.rowspan
371 )
372 if self.target:
373 v += ": {!r}".format(self.target)
374 return v
376 def __repr__(self) -> str:
377 return str(self)
class HdrSpan:
    """Record of one header cell/span, kept while a table is being parsed."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: TagSets,
        text: str,
        all_headers_row: bool,
    ) -> None:
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        assert all(isinstance(alternative, tuple) for alternative in tagsets)
        assert all_headers_row in (True, False)
        # Each alternative is stored de-duplicated and sorted.
        normalized = [
            tuple(sorted(set(alternative))) for alternative in tagsets
        ]
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        self.tagsets = normalized
        self.text = text
        self.all_headers_row = all_headers_row
        # A freshly created span always starts out unexpanded.
        self.expanded = False
def is_superscript(ch: str) -> bool:
    """Tell whether the single character ``ch`` is a superscript character.

    The decision is made from the Unicode character name: names starting
    with "SUPERSCRIPT ", "MODIFIER LETTER SMALL " or
    "MODIFIER LETTER CAPITAL " count as superscripts.  Characters without
    a Unicode name are not superscripts.
    """
    assert isinstance(ch, str) and len(ch) == 1
    name = unicodedata.name(ch, None)
    if name is None:
        return False
    return name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )
def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Drop, in place, tag combinations from ``tags`` that carry no
    information because together they cover every option.

    Two kinds of combinations are handled, in this order:
      * opposing pairs (animate/inanimate, virile/nonvirile), removed only
        when both are present and the language configuration enables it;
      * whole categories (numbers, genders, voices, strengths, persons,
        definitenesses), removed when every value the language
        configuration lists for the category is present.

    ``pos`` is type-checked but not otherwise used.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)

    opposing_pairs = (
        ("animate", "inanimate", "animate_inanimate_remove"),
        ("virile", "nonvirile", "virile_nonvirile_remove"),
    )
    for first, second, conf_key in opposing_pairs:
        # The configuration is consulted only when both tags are present.
        if first in tags and second in tags and get_lang_conf(lang, conf_key):
            tags.remove(first)
            tags.remove(second)

    exhaustive_categories = (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    )
    for conf_key in exhaustive_categories:
        members = get_lang_conf(lang, conf_key)
        # If all values of the category are listed, remove them all
        if members and all(member in tags for member in members):
            for member in members:
                tags.remove(member)
def tagset_cats(tagset: TagSets) -> set[str]:
    """Collect the tag categories (``valid_tags`` values) of every tag in
    every alternative of ``tagset`` into one set."""
    return {valid_tags[tag] for alternative in tagset for tag in alternative}
def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations).  If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives.  The tagsets are assumed to be lists of sorted tuples.

    Empty alternatives are dropped while merging; if nothing remains, the
    result is ``[()]``.  ``lang`` and ``pos`` are only passed on to
    ``remove_useless_tags``.
    """
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # This check used to re-test tagsets1, leaving tagsets2 unvalidated.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        """Add ``tags1`` to the result, merging it into an existing
        alternative when the two differ in at most one category."""
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            # print("tags1={} tags2={} num_differ={}"
            #       .format(tags1, tags2, num_differ))
            if num_differ <= 1:
                # Yes, they can be merged.  The list is modified here, but
                # the loop is left immediately via the return below.
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        tagsets.append(())

    # print("or_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, tagsets))
    return tagsets
def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible.

    Both arguments must be non-empty lists of tuples.  Each combination is
    passed through ``remove_useless_tags``, loses the
    "dummy-ignored-text-cell" marker if present, and is returned as a sorted
    tuple; duplicates are dropped while the order of first occurrence is
    kept.
    """
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # This check used to re-test tagsets1, leaving tagsets2 unvalidated.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets: list[tuple[str, ...]] = []
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            combined = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, combined)
            combined.discard("dummy-ignored-text-cell")
            tags = tuple(sorted(combined))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    # print("and_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, new_tagsets))
    return new_tagsets
@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing.  This returns
    (cleaned, refs, defs, tags).

    ``cleaned`` is the cell text with reference markers removed, ``refs``
    the reference symbols found at the end of the cell (NFKD-normalized),
    ``defs`` a list of (symbol, definition text) pairs when the cell is
    itself a footnote definition, and ``tags`` header tags derived from
    markers such as "rare" or the language's "special_references".

    Two special results exist: note-like cells give
    ("dummy-ignored-text-cell", [], [], []), and definition cells give
    ("", [], defs, []).

    ``word`` is not used in the body; it only takes part in the cache key.
    The results are cached and contain lists, so the same list objects are
    handed out on every cache hit.
    """
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags: list[str] = []
    # Drop a trailing comma or bullet, then collapse all whitespace runs.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Cells that start with one of these phrases are explanatory notes, not
    # forms or headers.
    # NOTE(review): the alternative r"\^* Note:" matches zero or more "^"
    # followed by " Note:"; a literal "^* Note:" would need r"\^\* Note:" --
    # confirm which was intended.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers
    refs: list[str] = []
    special_references = get_lang_conf(lang, "special_references")
    # Peel "^x" / "^(a,b)" markers off the end of the cell, last one first.
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        ofs = 0
        ref = None
        deflst = []
        # Every def_re match starts a new definition; the text between two
        # matches is the definition of the earlier symbol.
        # NOTE(review): only groups 3, 5 and 6 of def_re are consulted for
        # the symbol; def_re's last group (superscript/modifier letters) is
        # not, so such a match yields an empty symbol -- confirm this is
        # intentional.
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    # Strip trailing superscript characters (and "†") one at a time.
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition
    # (a single digit followed by ")", " " or ":" and then the text)
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols.  Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: col={!r} refs={!r} hdr_tags={}"
    #       .format(col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags
@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parse the title of an inflection table.

    Returns ``(global_tags, table_tags, extra_forms)``:
      * ``global_tags`` -- tags to be added to each inflection entry;
      * ``table_tags`` -- tags for the word that are not to be added to
        every form;
      * ``extra_forms`` -- list of form dictionaries (each with "form",
        "source" and "tags") describing additional forms to be included in
        the part-of-speech entry.

    HTML entities are unescaped, markup tags are removed and whitespace is
    collapsed before the title is examined.  Results are cached per
    ``(title, source)``.
    """
    assert isinstance(title, str)
    assert isinstance(source, str)
    title = re.sub(r"(?i)<[^>]*>", "", html.unescape(title)).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags: list[str] = []
    table_tags: list[str] = []
    extra_forms = []

    # Add certain global tags based on contained words
    for word_match in title_contains_global_re.finditer(title):
        matched = word_match.group(0).lower()
        # "Declension of X"-style fragments are matched only to be skipped.
        if table_hdr_ign_part_re.match(matched):
            continue
        global_tags.extend(title_contains_global_map[matched].split())

    # Add certain tags to table-tags "form" based on contained words
    for word_match in title_contains_wordtags_re.finditer(title):
        matched = word_match.group(0).lower()
        if table_hdr_ign_part_re.match(matched):
            continue
        table_tags.extend(title_contains_wordtags_map[matched].split())

    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")

    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    class_pattern = (
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b"
    )
    for class_match in re.finditer(class_pattern, title):
        class_form: FormData = {
            "form": class_match.group(1),
            "source": source,
            "tags": ["class"],
        }
        extra_forms.append(class_form)

    # Parse parenthesized part from title
    for paren_match in re.finditer(r"\(([^)]*)\)", title):
        # group(0) is the whole string, group(1) first parens
        for raw_elem in paren_match.group(1).split(","):
            elem = raw_elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
                continue
            start_match = title_elemstart_re.match(elem)
            if not start_match:
                continue
            extra_forms.append(
                {
                    "form": elem[start_match.end() :],
                    "source": source,
                    "tags": title_elemstart_map[start_match.group(1)].split(),
                }
            )

    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" in title:
        return global_tags, table_tags, extra_forms

    # No parenthesized parts
    pt_match = re.search(r"\b(Portuguese) (-.* verb) ", title)
    if pt_match is not None:
        extra_forms.append(
            {"form": pt_match.group(2), "tags": ["class"], "source": source}
        )
    for raw_elem in title.split(","):
        elem = raw_elem.strip()
        if elem in title_elements_map:
            table_tags.extend(title_elements_map[elem].split())
        elif elem.endswith("-stem"):
            extra_forms.append(
                {"form": elem, "tags": ["class"], "source": source}
            )
    return global_tags, table_tags, extra_forms
811def expand_header(
812 wxr: WiktextractContext,
813 tablecontext: "TableContext",
814 word: str,
815 lang: str,
816 pos: str,
817 text: str,
818 base_tags: Union[list[str], set[str], tuple[str, ...]],
819 silent=False,
820 ignore_tags=False,
821 depth=0,
822 column_number: int | None = None,
823) -> list[tuple[str, ...]]:
824 """Expands a cell header to tagset, handling conditional expressions
825 in infl_map. This returns list of tuples of tags, each list element
826 describing an alternative interpretation. ``base_tags`` is combined
827 column and row tags for the cell in which the text is being interpreted
828 (conditional expressions in inflection data may depend on it).
829 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags``
830 is True, then tags listed in "if" will be ignored in the test (this is
831 used when trying to heuristically detect whether a non-<th> cell is anyway
832 a header)."""
833 assert isinstance(wxr, WiktextractContext)
834 assert isinstance(word, str)
835 assert isinstance(lang, str)
836 assert isinstance(pos, str)
837 assert isinstance(text, str)
838 assert isinstance(base_tags, (list, tuple, set))
839 assert silent in (True, False)
840 assert isinstance(depth, int)
841 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags))
842 # First map the text using the inflection map
843 text = clean_value(wxr, text)
844 combined_return: list[tuple[str, ...]] = []
845 parts = split_at_comma_semi(text, separators=[";"])
846 for text in parts:
847 if not text: 847 ↛ 848line 847 didn't jump to line 848 because the condition on line 847 was never true
848 continue
849 if text in infl_map:
850 v = infl_map[text] # list or string
851 else:
852 m = re.match(infl_start_re, text)
853 if m is not None: 853 ↛ 854line 853 didn't jump to line 854 because the condition on line 853 was never true
854 v = infl_start_map[m.group(1)]
855 # print("INFL_START {} -> {}".format(text, v))
856 elif re.match(r"Notes", text):
857 # Ignored header
858 # print("IGNORING NOTES")
859 combined_return = or_tagsets(
860 lang, pos, combined_return, [("dummy-skip-this",)]
861 )
862 # this just adds dummy-skip-this
863 continue
864 elif text in IGNORED_COLVALUES:
865 combined_return = or_tagsets(
866 lang, pos, combined_return, [("dummy-ignore-skipped",)]
867 )
868 continue
869 # Try without final parenthesized part
870 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text)
871 if text_without_parens in infl_map:
872 v = infl_map[text_without_parens]
873 elif m is None: 873 ↛ 889line 873 didn't jump to line 889 because the condition on line 873 was always true
874 if not silent:
875 wxr.wtp.debug(
876 "inflection table: unrecognized header: {}".format(
877 repr(text)
878 ),
879 sortid="inflection/735",
880 )
881 # Unrecognized header
882 combined_return = or_tagsets(
883 lang, pos, combined_return, [("error-unrecognized-form",)]
884 )
885 continue
887 # Then loop interpreting the value, until the value is a simple string.
888 # This may evaluate nested conditional expressions.
889 default_else = None
890 while True:
891 # If it is a string, we are done.
892 if isinstance(v, str):
893 tags = set(v.split())
894 remove_useless_tags(lang, pos, tags)
895 tagset = [tuple(sorted(tags))]
896 break
897 # For a list, just interpret it as alternatives. (Currently the
898 # alternatives must directly be strings.)
899 if isinstance(v, (list, tuple)):
900 tagset = []
901 for x in v:
902 tags = set(x.split())
903 remove_useless_tags(lang, pos, tags)
904 tags_t = tuple(sorted(tags))
905 if tags_t not in tagset: 905 ↛ 901line 905 didn't jump to line 901 because the condition on line 905 was always true
906 tagset.append(tags_t)
907 break
908 # Otherwise the value should be a dictionary describing a
909 # conditional expression.
910 if not isinstance(v, dict): 910 ↛ 911line 910 didn't jump to line 911 because the condition on line 910 was never true
911 wxr.wtp.debug(
912 "inflection table: internal: "
913 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]),
914 sortid="inflection/767",
915 )
916 tagset = [()]
917 break
918 # Evaluate the conditional expression.
919 assert isinstance(v, dict)
920 cond: Union[bool, str] = "default-true"
921 c: Union[str, list[str], set[str]] = ""
922 # Handle "lang" condition. The value must be either a
923 # single language or a list of languages, and the
924 # condition evaluates to True if the table is one of
925 # those languages.
926 if "lang" in v:
927 c = v["lang"]
928 # check if it's a code and transform if necessary
929 if isinstance(c, str):
930 if c != lang:
931 cond = lang == code_to_name(c, "en")
932 else:
933 cond = True
934 else:
935 assert isinstance(c, (list, tuple, set))
936 if lang not in c:
937 cond = name_to_code(lang, "en") in c
938 else:
939 cond = True
940 # Handle "nested-table-depth" condition. The value must
941 # be an int or list of ints, and the condition evaluates
942 # True if the depth is one of those values.
943 # "depth" is how deep into a nested table tree the current
944 # table lies. It is first started in handle_wikitext_table,
945 # so only applies to tables-within-tables, not other
946 # WikiNode content. `depth` is currently only passed as a
947 # parameter down the table parsing stack, and not stored.
948 if cond and "nested-table-depth" in v: 948 ↛ 949line 948 didn't jump to line 949 because the condition on line 948 was never true
949 d = v["nested-table-depth"]
950 if isinstance(d, int):
951 cond = d == depth
952 else:
953 assert isinstance(d, (list, tuple, set))
954 cond = depth in d
955 # Column index: check if we're in position X of the row
956 if cond and "column-index" in v:
957 index = v["column-index"]
958 if isinstance(index, int): 958 ↛ 961line 958 didn't jump to line 961 because the condition on line 958 was always true
959 cond = index == column_number
960 else:
961 assert isinstance(index, (list, tuple, set))
962 cond = column_number in index
963 # Handle inflection-template condition. Must be a string
964 # or list of strings, and if tablecontext.template_name is in
965 # those, accept the condition.
966 # TableContext.template_name is passed down from page/
967 # parse_inflection, before parsing and expanding itself
968 # has begun.
969 if cond and tablecontext and "inflection-template" in v:
970 d1 = v["inflection-template"]
971 if isinstance(d1, str): 971 ↛ 974line 971 didn't jump to line 974 because the condition on line 971 was always true
972 cond = d1 == tablecontext.template_name
973 else:
974 assert isinstance(d1, (list, tuple, set))
975 cond = tablecontext.template_name in d1
976 # Handle "pos" condition. The value must be either a single
977 # part-of-speech or a list of them, and the condition evaluates to
978 # True if the part-of-speech is any of those listed.
979 if cond and "pos" in v:
980 c = v["pos"]
981 if isinstance(c, str):
982 cond = c == pos
983 else:
984 assert isinstance(c, (list, tuple, set))
985 cond = pos in c
986 # Handle "if" condition. The value must be a string containing a
987 # space-separated list of tags. The condition evaluates to True if
988 # ``base_tags`` contains all of the listed tags. If the condition
989 # is of the form "any: ...tags...", then any of the tags will be
990 # enough.
991 if cond and "if" in v and not ignore_tags:
992 c = v["if"]
993 assert isinstance(c, str)
994 # "if" condition is true if any of the listed tags is present if
995 # it starts with "any:", otherwise all must be present
996 if c.startswith("any: "):
997 cond = any(t in base_tags for t in c[5:].split())
998 else:
999 cond = all(t in base_tags for t in c.split())
1001 # Handle "default" assignment. Store the value to be used
1002 # as a default later.
1003 if "default" in v:
1004 assert isinstance(v["default"], str)
1005 default_else = v["default"]
1007 # Warning message about missing conditions for debugging.
1009 if cond == "default-true" and not default_else and not silent:
1010 wxr.wtp.debug(
1011 "inflection table: IF MISSING COND: word={} "
1012 "lang={} text={} base_tags={} c={} cond={}".format(
1013 word, lang, text, base_tags, c, cond
1014 ),
1015 sortid="inflection/851",
1016 )
1017 # Based on the result of evaluating the condition, select either
1018 # "then" part or "else" part.
1019 if cond:
1020 v = v.get("then", "")
1021 else:
1022 v1 = v.get("else")
1023 if v1 is None:
1024 if default_else is not None:
1025 v = default_else
1026 else:
1027 if not silent:
1028 wxr.wtp.debug(
1029 "inflection table: IF WITHOUT ELSE EVALS "
1030 "False: "
1031 "{}/{} {!r} base_tags={}".format(
1032 word, lang, text, base_tags
1033 ),
1034 sortid="inflection/865",
1035 )
1036 v = "error-unrecognized-form"
1037 else:
1038 v = v1
1040 # Merge the resulting tagset from this header part with the other
1041 # tagsets from the whole header
1042 combined_return = or_tagsets(lang, pos, combined_return, tagset)
1044 # Return the combined tagsets, or empty tagset if we got no tagsets
1045 if not combined_return:
1046 combined_return = [()]
1047 return combined_return
def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list[HdrSpan],
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Compute the column tagsets for a cell from the given header spans.

    The cell covers columns ``start`` .. ``start + colspan - 1``.  The header
    spans are scanned in reverse order, i.e., headers lower in the table
    (closer to the cell) first.  Spans that do not overlap the cell
    horizontally are ignored; a span that only partially overlaps it ends the
    scan (at the right edge only if the span has not been expanded).
    Tagsets of accepted headers are combined with ``and_tagsets`` one row at
    a time, and a series of category-conflict rules (several of them switched
    per language through ``get_lang_conf``) decides whether a header is
    merged, skipped, or stops the scan.

    Args:
        lang: Language of the table; passed to the tagset helpers and used
            for the language-specific configuration lookups.
        pos: Part of speech; passed to the tagset helpers.
        hdrspans: Header spans to consider.  They are only read here.
        start: Index of the first column covered by the cell (>= 0).
        colspan: Number of columns covered by the cell (>= 1).
        celltext: Text of the cell.  It is only used to switch on debug
            prints when it equals the module-level ``debug_cell_text``.

    Returns:
        A list of tagsets, each a tuple of tag strings.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # For debugging, set debug_cell_text to the form for whose cell you want
    # debug prints.
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                " row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    # (start, colspan) positions whose header has already been consumed.
    used = set()
    coltags: list[tuple[str, ...]] = [()]
    # 1000000 is a "no row yet" sentinel for both row trackers below.
    last_header_row = 1000000
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets: list[tuple[str, ...]] = [()]
    row_tagsets_rownum = 1000000
    # id()s of the header spans that have already been taken into account.
    used_hdrspans = set()
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell.  Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = {
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            }
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets.  This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - {
                "gender",
                "number",
                "person",
                "case",
                "category",
                "voice",
            }:
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row.  Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = {
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                }
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside
                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above.  However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # On the first accepted header row_tagsets_rownum still holds
            # its 1000000 sentinel, so this branch is always taken then.
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately precedes the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            # First chain: rules that block bleeding of categories from
            # above.  A rule whose language switch is off falls through to
            # the second chain below.
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & {"person", "number"}:
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        # Distinct message so this rule can be told apart
                        # from the person/number rule above in debug output.
                        print("stopping on non-finite-tense")
                    break
            elif "non-finite" in cur_cats and new_cats & {"mood"}:
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            # Second chain: decide between skipping this header, stopping
            # the scan, or merging the header into the current row.
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    # Report the row accumulator: that is what was just
                    # merged (coltags is only updated on a row change).
                    print("merged: {}".format(row_tagsets))
        # Update the row number from which we have last taken headers
        # NOTE(review): placed at loop level (runs for every header that was
        # neither skipped with `continue` nor stopped with `break`); the
        # nesting was reconstructed from a flattened listing -- confirm.
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags
1432def parse_simple_table(
1433 wxr: WiktextractContext,
1434 tablecontext: "TableContext",
1435 word: str,
1436 lang: str,
1437 pos: str,
1438 rows: list[list[InflCell]],
1439 titles: list[str],
1440 source: str,
1441 after: str,
1442 depth: int,
1443) -> list[FormData]:
1444 """This is the default table parser. Despite its name, it can parse
1445 complex tables. This returns a list of forms to be added to the
1446 part-of-speech, or None if the table could not be parsed."""
1447 assert isinstance(wxr, WiktextractContext)
1448 assert isinstance(tablecontext, TableContext)
1449 assert isinstance(word, str)
1450 assert isinstance(lang, str)
1451 assert isinstance(pos, str)
1452 assert isinstance(rows, list)
1453 assert isinstance(source, str)
1454 assert isinstance(after, str)
1455 assert isinstance(depth, int)
1456 for row in rows:
1457 for cell in row:
1458 assert isinstance(cell, InflCell)
1459 assert isinstance(titles, list)
1460 for x in titles:
1461 assert isinstance(x, str)
1463 # print("PARSE_SIMPLE_TABLE: TITLES:", titles)
1464 if debug_cell_text: 1464 ↛ 1465line 1464 didn't jump to line 1465 because the condition on line 1464 was never true
1465 print("ROWS:")
1466 for row in rows:
1467 print(" ", row)
1469 # Check for forced rowspan kludge. See e.g.
1470 # maorski/Serbo-Croatian. These are essentially multi-row
1471 # cells implemented using <br> rather than separate cell. We fix this
1472 # by identifying rows where this happens, and splitting the current row
1473 # to multiple rows by synthesizing additional cells.
1474 new_rows = []
1475 for row in rows:
1476 split_row = (
1477 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row)
1478 and
1479 # x is an InflCell
1480 all(x.rowspan == 1 for x in row)
1481 )
1482 if not split_row:
1483 new_rows.append(row)
1484 continue
1485 row1 = []
1486 row2 = []
1487 for cell in row:
1488 cell1 = copy.deepcopy(cell)
1489 if "\n" in cell.text:
1490 # Has more than one line - split this cell
1491 parts = cell.text.strip().splitlines()
1492 if len(parts) != 2: 1492 ↛ 1493line 1492 didn't jump to line 1493 because the condition on line 1492 was never true
1493 wxr.wtp.debug(
1494 "forced rowspan kludge got {} parts: {!r}".format(
1495 len(parts), cell.text
1496 ),
1497 sortid="inflection/1234",
1498 )
1499 cell2 = copy.deepcopy(cell)
1500 cell1.text = parts[0]
1501 cell2.text = parts[1]
1502 else:
1503 cell1.rowspan = 2
1504 cell2 = cell1 # ref, not a copy
1505 row1.append(cell1)
1506 row2.append(cell2)
1507 new_rows.append(row1)
1508 new_rows.append(row2)
1509 rows = new_rows
1510 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:")
1511 # for row in rows:
1512 # print(" ", row)
1514 # Parse definitions for references (from table itself and from text
1515 # after it)
1516 def_ht = {}
def add_defs(defs: list[tuple[str, str]]) -> None:
    """Record the tags described by reference definitions in ``def_ht``.

    For each ``(ref, text)`` pair, the text before the first ". " is
    decoded with ``decode_tags``; if that fails, it is retried with the
    first character lowercased.  Definitions that still cannot be decoded
    (topics present or an "error-unknown-tag" tagset) are skipped.  On
    success all decoded tagsets are flattened into one sorted tuple that
    is stored as ``def_ht[ref]``.
    """
    for ref, d in defs:
        d = d.strip()
        d = d.split(". ")[0].strip()  # text before ". "
        if not d:
            continue
        if d.endswith("."):  # catch a trailing "." left after the split
            d = d[:-1]
        tags, topics = decode_tags(d, no_unknown_starts=True)
        if topics or any("error-unknown-tag" in ts for ts in tags):
            # Retry with a lowercased first letter.  Slicing instead of
            # d[0] keeps this safe when d became empty above (e.g. the
            # definition was just ".").
            d = d[:1].lower() + d[1:]
            tags, topics = decode_tags(d, no_unknown_starts=True)
            if topics or any("error-unknown-tag" in ts for ts in tags):
                # Failed to parse as tags
                continue
        tags1_s: set[str] = set()
        for ts in tags:
            # Set.update is a union operation: definition tags are flat
            tags1_s.update(ts)
        tags1 = tuple(sorted(tags1_s))
        def_ht[ref] = tags1
def generate_tags(
    rowtags: list[tuple[str, ...]], table_tags: list[str]
) -> tuple[
    list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
]:
    """Expand the current header text for every row/column tag combination.

    For each row tagset and each column tagset returned by
    ``compute_coltags`` the header ``text`` is expanded with
    ``expand_header``.  Returns ``(new_rowtags, new_coltags,
    all_hdr_tags)``: the combined row tagsets, the header tagsets with
    the tags of referenced footnotes added, and the plain header tagsets.
    All three lists are free of duplicates and keep first-seen order.
    """
    row_result: list[tuple[str, ...]] = []
    col_result: list[tuple[str, ...]] = []
    plain_hdr_tags: list[tuple[str, ...]] = []  # without footnote tags
    for row_ts in rowtags:
        col_alternatives = compute_coltags(
            lang,
            pos,
            hdrspans,
            col_idx,  # col_idx=>start
            colspan,
            col,  # cell_text
        )
        for col_ts in col_alternatives:
            # base_tags are used in infl_map "if"-conds.
            base_tags: set[str] = set(rt_and_ct := row_ts + col_ts)
            base_tags |= set(global_tags)
            base_tags |= set(table_tags)
            del rt_and_ct
            expanded = expand_header(
                wxr,
                tablecontext,
                word,
                lang,
                pos,
                text,
                base_tags,
                depth=depth,
                column_number=col_idx,
            )
            for hdr_ts in expanded:
                if hdr_ts not in plain_hdr_tags:
                    plain_hdr_tags.append(hdr_ts)
                # Add tags from referenced footnotes, then normalize to a
                # sorted tuple before recording the alternative.
                with_refs_s = set(hdr_ts)
                with_refs_s.update(refs_tags)
                with_refs = tuple(sorted(with_refs_s))
                if with_refs not in col_result:
                    col_result.append(with_refs)
                # Kludge (saprast/Latvian/Verb): ignore row tags
                # if trying to add a non-finite after mood.
                drop_row_tags = any(
                    valid_tags[t] == "mood" for t in row_ts
                ) and any(valid_tags[t] == "non-finite" for t in with_refs)
                merged_s = set(with_refs)
                if not drop_row_tags:
                    merged_s = merged_s | set(row_ts)
                merged = tuple(sorted(merged_s | set(hdr_tags)))
                if merged not in row_result:
                    row_result.append(merged)
    return row_result, col_result, plain_hdr_tags
def add_new_hdrspan(
    col: str,
    hdrspans: list[HdrSpan],
    store_new_hdrspan: bool,
    col0_followed_by_nonempty: bool,
    col0_hdrspan: Optional[HdrSpan],
) -> tuple[str, bool, Optional[HdrSpan]]:
    """Register a header span for the current cell and update col0 tracking.

    A new ``HdrSpan`` is built from the enclosing function's current cell
    state (``col_idx``, ``colspan``, ``rowspan``, ``rownum``,
    ``new_coltags``, ``all_headers``) and appended to ``hdrspans``.  The
    function then decides whether the tracked left-side header
    (``col0_hdrspan``) is kept, widened up to the current column, or
    replaced by the new span.

    Args:
        col: Text of the current cell; returned unchanged.
        hdrspans: List that receives the new header span (mutated).
        store_new_hdrspan: If true, the span is also appended to
            ``tablecontext.stored_hdrspans``.
        col0_followed_by_nonempty: Current value of the tracking flag.
        col0_hdrspan: The left-side header being tracked, or None.

    Returns:
        ``(col, col0_followed_by_nonempty, col0_hdrspan)`` with the
        updated tracking state.
    """
    hdrspan = HdrSpan(
        col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
    )
    hdrspans.append(hdrspan)

    # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
    # to be added to a register of stored hdrspans to be used
    # later with "dummy-load-stored-hdrspans".
    if store_new_hdrspan:
        tablecontext.stored_hdrspans.append(hdrspan)

    # Handle headers that are above left-side header
    # columns and are followed by personal pronouns in
    # remaining columns (basically headers that
    # evaluate to no tags).  In such cases widen the
    # left-side header to the full row.
    if previously_seen:  # id(cell) in seen_cells previously
        return col, True, col0_hdrspan
    if col0_hdrspan is None:
        # Nothing tracked yet: start tracking the new span.
        return col, col0_followed_by_nonempty, hdrspan
    if not any(all_hdr_tags):
        # Header evaluates to no tags: leave the tracking state alone.
        return col, col0_followed_by_nonempty, col0_hdrspan

    col0_cats = tagset_cats(col0_hdrspan.tagsets)
    later_cats = tagset_cats(all_hdr_tags)
    col0_allowed = get_lang_conf(lang, "hdr_expand_first")
    # dummy2 has different behavior than plain dummy
    # and does not belong here.
    later_allowed = get_lang_conf(lang, "hdr_expand_cont") | {"dummy"}

    # Only expand if [col0_cats and later_cats are allowed
    # and don't overlap] and [col0 has tags], and there have
    # been [no disallowed cells in between].
    #
    # There are three cases here:
    #   - col0_hdrspan set, continue with allowed current
    #   - col0_hdrspan set, expand, start new
    #   - col0_hdrspan set, no expand, start new
    if (
        not col0_followed_by_nonempty
        and
        # XXX Only one cat of tags: kunna/Swedish
        # XXX len(col0_cats) == 1 and
        col0_hdrspan.rowspan >= rowspan
        and
        # from hdrspan
        not (later_cats - later_allowed)
        and not (col0_cats & later_cats)
    ):
        # First case: col0 set, continue
        return col, col0_followed_by_nonempty, col0_hdrspan
    # We are going to start new col0_hdrspan.  Check if
    # we should expand.
    if (
        not col0_followed_by_nonempty
        and not (col0_cats - col0_allowed)
        and
        # Only "allowed" allowed
        # XXX len(col0_cats) == 1 and
        col_idx > col0_hdrspan.start + col0_hdrspan.colspan
    ):
        # col_idx is beyond current colspan
        # *Expand* current col0_hdrspan
        col0_hdrspan.colspan = col_idx - col0_hdrspan.start
        col0_hdrspan.expanded = True
    if col == debug_cell_text:
        print("START NEW {}".format(hdrspan.tagsets))
    # Start tracking the new span.  previously_seen is known to be false
    # here (that case returned above), so no further check is needed.
    return col, False, hdrspan
def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
    """Split the text of a table cell into alternative forms.

    Returns ``(col, alts, split_extra_tags)``.  ``alts`` is the list of
    alternatives and ``split_extra_tags`` the extra tags that come with a
    language-specific special split (empty otherwise).  Note that the
    returned ``col`` is the working copy: on the default-splitting path,
    text matched by a ``form_transformations`` pattern is still replaced
    by its placeholder character in it.
    """
    split_extra_tags = []
    if col and is_superscript(col[0]):
        alts = [col]
    else:
        separators = [";", "•", r"\n", " or "]
        if " + " not in col:
            separators.append(",")
            # NOTE(review): the "/" test is nested under the " + " test
            # here; the nesting was reconstructed from a flattened
            # listing -- confirm against the original file.
            if not col.endswith("/"):
                separators.append("/")
        if col in special_phrase_splits:
            # Use language-specific special splits.
            # These are phrases and constructions that have
            # unique ways of splitting, not specific characters
            # to split on like with the default splitting.
            alts, tags = special_phrase_splits[col]
            split_extra_tags = tags.split()
            for x in split_extra_tags:
                assert x in valid_tags
            assert isinstance(alts, (list, tuple))
            assert isinstance(tags, str)
        elif (
            (
                m := re.match(
                    # word1, word2 (romanization1, romanization2)
                    r"\s*([^(),]+),([^(),]+)\(([^(),]+),([^(),]+)\)",
                    col,
                )
            )
            # NOT `word, (tag, tag)` with an empty m.group(2)...
            # There is a test that fails because of this.  It's an
            # outdated table, but still, ...Italian_verb1
            and all(s.strip() for s in m.groups())
            and any(
                # except for entries like word1, word2 (tag2, tag2)...
                classify_desc(s) in ("english", "romanization")
                for s in (m.group(3), m.group(4))
            )
        ):
            alts = [m.group(1), m.group(2), m.group(3), m.group(4)]
        else:
            # Use default splitting.  However, recognize
            # language-specific replacements and change them to magic
            # characters before splitting.  This way we won't split
            # them.  This is important for, e.g., recognizing
            # alternative pronouns.
            # The magic characters are characters out of Unicode scope
            # that are given a simple incremental value, int > unicode.
            repls = {}
            magic_ch = MAGIC_FIRST
            trs = get_lang_conf(lang, "form_transformations")
            # trs is a list of lists of strings
            for _, pattern, _, _ in trs:
                # pattern is a pattern string, like "^ich"
                # form_transformations data is doing double-duty here,
                # because the pattern strings are already known to us and
                # not meant to be split.
                m = re.search(pattern, col)
                if m is not None:
                    # Pattern found in text: replace it with the next
                    # magic character and remember the matched text
                    # (.group(0) is the whole match).
                    magic = chr(magic_ch)
                    magic_ch += 1
                    col = re.sub(pattern, magic, col)
                    repls[magic] = m.group(0)
            # With magic characters in place, split the text so that
            # pre-transformation text is out of the way.
            alts0 = split_at_comma_semi(col, separators=separators)
            alts = []
            for alt in alts0:
                # Put the original texts back.  This is a literal
                # replacement: the remembered text is cell content, not a
                # regex replacement template, so a backslash in it must
                # not be interpreted as an escape.
                for magic, original in repls.items():
                    alt = alt.replace(magic, original)
                alts.append(alt)

    # Remove "*" from beginning of forms, as in non-attested
    # or reconstructed forms.  Otherwise it might confuse romanization
    # detection.
    alts = [re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts]
    alts = [x for x in alts if not re.match(r"pronounced with |\(with ", x)]
    alts = [re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts]
    return col, alts, split_extra_tags
def handle_parens(
    form: str, roman: str, clitic: str | None, extra_tags: list[str]
) -> tuple[str, str, str | None]:
    """Interpret the parenthesised part of a cell form.

    This closure reads ``paren`` (the text inside the parentheses), ``m``
    (the regex match covering the parenthesised span within ``form``) and
    ``subst`` (the text that replaces that span) from the enclosing scope.

    Args:
        form: The cell form that contains the parenthesised span.
        roman: Romanization known so far; a falsy value means none yet.
        clitic: Clitic known so far, returned unchanged unless the
            parenthesised text itself is recognised as a clitic.
        extra_tags: Extended in place with tags decoded from the
            parenthesised text.

    Returns:
        The ``(form, roman, clitic)`` triple after interpretation.  When
        none of the cases below applies, the inputs are returned as is.
    """
    if TYPE_CHECKING:
        assert isinstance(paren, str)
        assert isinstance(m, re.Match)

    # The form with the parenthesised span replaced by `subst`; every
    # case that consumes the parentheses returns this value.
    without_paren = (form[: m.start()] + subst + form[m.end() :]).strip()

    # An apostrophe followed by one to three lowercase letters: treat the
    # whole parenthesised text as a clitic.
    if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
        return without_paren, roman, paren

    paren_kind = classify_desc(paren)

    if paren_kind == "tags":
        tagsets1, topics1 = decode_tags(paren)
        if topics1:
            # Topics were decoded: leave the form untouched.
            return form, roman, clitic
        for tagset in tagsets1:
            # There are some generated tags containing spaces; do not
            # let them through here.
            extra_tags.extend(tuple(x for x in tagset if " " not in x))
        return without_paren, roman, clitic

    # Parentheses contain a romanization: only when they do not start the
    # form, no romanization is known yet, and the text before them
    # classifies as "other" (~ plain text).
    if (
        m.start() > 0
        and not roman
        and classify_desc(form[: m.start()]) == "other"
        and paren_kind in ("romanization", "english")
        and not re.search(r"^with |-form$", paren)
    ):
        return without_paren, paren, clitic

    # "with ..." / "...-form..." notes are dropped from the form.
    if re.search(r"^with |-form", paren):
        return without_paren, roman, clitic

    return form, roman, clitic
def merge_row_and_column_tags(
    form: str,
    some_has_covered_text: bool,
    links: list[tuple[str, str]] | None = None,
) -> tuple[list[FormData], str, bool]:
    """Create the form entries for one cell alternative.

    Each pairing of a row tagset (from ``rowtags``) with a column tagset
    (from ``coltags``) produces at most one entry, plus an additional
    "clitic" entry when ``clitic`` is set.  For mood, case and number
    tags, those coming from the row take precedence over the column
    (cf. aussteigen/German/Verb imperative forms).

    Besides its parameters, this closure reads ``rowtags``, ``coltags``,
    ``global_tags``, ``extra_tags``, ``refs_tags``, ``tablecontext``,
    ``lang``, ``pos``, ``wxr``, ``col_idx``, ``has_covering_hdr``,
    ``roman``, ``ipa``, ``clitic``, ``source`` and ``cell_links`` from the
    enclosing scope.

    Args:
        form: Text of the form.  It may be adjusted by the
            language-specific tag extraction, by head-final tag parsing,
            or replaced with "-" for ignorable cell values; such changes
            carry over to later row/column pairings.
        some_has_covered_text: Whether a non-ignorable form has already
            been seen in a column listed in ``has_covering_hdr``.
        links: Links to match against the form.  When ``None``, the
            enclosing scope's ``cell_links`` is used instead.

    Returns:
        A tuple of the generated entries, the possibly adjusted form and
        the updated ``some_has_covered_text`` flag.
    """
    # Constant tag groups, built once instead of on every iteration.
    person_tags = ("first-person", "second-person", "third-person")
    impersonal_dropped = {
        "first-person",
        "second-person",
        "third-person",
        "singular",
        "plural",
    }
    dummy_tags = {
        "dummy-mood",
        "dummy-tense",
        "dummy-ignore-skipped",
        "dummy-object-concord",
        "dummy-reset-headers",
        "dummy-use-as-coltags",
        "dummy-use-as-rowtags",
        "dummy-store-hdrspan",
        "dummy-load-stored-hdrspans",
        "dummy-reset-stored-hdrspans",
        "dummy-section-header",
    }

    ret: list[FormData] = []
    for rt in sorted(rowtags):
        if "dummy-use-as-coltags" in rt:
            continue
        for ct in sorted(coltags):
            if "dummy-use-as-rowtags" in ct:
                continue
            tags = set(global_tags)
            tags.update(extra_tags)
            tags.update(rt)
            tags.update(refs_tags)
            tags.update(tablecontext.section_header)

            # Merge tags from column. For certain kinds of tags,
            # those coming from row take precedence.
            old_tags = set(tags)
            for t in ct:
                c = valid_tags[t]
                if c in ("mood", "case", "number") and any(
                    valid_tags[tt] == c for tt in old_tags
                ):
                    continue
                tags.add(t)

            # Extract language-specific tags from the form. This may
            # also adjust the form.
            form, lang_tags = lang_specific_tags(lang, pos, form)
            tags.update(lang_tags)

            # For non-finite verb forms, see if they have a
            # gender/class suffix.
            if pos == "verb" and any(
                valid_tags[t] == "non-finite" for t in tags
            ):
                form, tt = parse_head_final_tags(wxr, lang, form)
                tags.update(tt)

            # Remove "personal" tag if have nth person; these come up
            # with e.g. reconhecer/Portuguese/Verb. But not if we also
            # have "pronoun".
            if (
                "personal" in tags
                and "pronoun" not in tags
                and not tags.isdisjoint(person_tags)
            ):
                tags.remove("personal")

            # If we have impersonal, remove person and number.
            # This happens with e.g. viajar/Portuguese/Verb
            if "impersonal" in tags:
                tags = tags - impersonal_dropped

            # Remove unnecessary "positive" tag from verb forms (and a
            # "negative" tag accompanying it).
            if pos == "verb" and "positive" in tags:
                tags.discard("negative")
                tags.remove("positive")

            # Many Russian (and other Slavic) inflection tables have
            # animate/inanimate distinction that generates separate
            # entries for neuter/feminine, but the distinction only
            # applies to masculine. Remove them from neuter/feminine
            # and eliminate duplicates.
            if (
                get_lang_conf(lang, "masc_only_animate")
                and "masculine" not in tags
                and "plural" not in tags
                and ("neuter" in tags or "feminine" in tags)
            ):
                tags = tags - {"animate", "inanimate"}

            # German adjective tables contain "(keiner)" etc for mixed
            # declension plural. When the adjective disappears and it
            # becomes just one word, remove the "includes-article" tag.
            # e.g. eiskalt/German
            if "includes-article" in tags and " " not in form:
                tags.remove("includes-article")

            # Handle ignored forms. We mark that the form was provided.
            # This is important information; some words just do not
            # have a certain form. However, there are also many cases
            # where no word in a language has a particular form.
            # Post-processing could detect and remove such cases.
            if form in IGNORED_COLVALUES:
                # The cell text seems to be ignorable.
                if "dummy-ignore-skipped" in tags:
                    continue
                if (
                    col_idx not in has_covering_hdr
                    and some_has_covered_text
                ):
                    continue
                # Don't ignore this cell if there's been a header
                # above it.
                form = "-"
            elif col_idx in has_covering_hdr:
                some_has_covered_text = True

            # Handle ambiguous object concord. If a header gives the
            # "dummy-object-concord"-tag to a word, replace person,
            # number and gender tags with their "object-" counterparts
            # so that the verb agrees with the object instead.
            # Use only when the verb has ONLY object agreement!
            # a پخول/Pashto
            if "dummy-object-concord" in tags:
                for subtag, objtag in object_concord_replacements.items():
                    if subtag in tags:
                        tags.remove(subtag)
                        tags.add(objtag)

            # Remove the dummy tags that only steer table parsing, e.g.
            # the dummy mood tag that we sometimes use to block adding
            # other mood and related tags.
            tags = tags - dummy_tags

            # Perform language-specific tag replacements according to
            # rules in a table.
            lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
            if lang_tag_mappings is not None:
                for pre, post in lang_tag_mappings.items():
                    if all(t in tags for t in pre):
                        tags = (tags - set(pre)) | set(post)

            # Warn if there are entries with empty tags
            if not tags:
                wxr.wtp.debug(
                    "inflection table: empty tags for {}".format(form),
                    sortid="inflection/1826",
                )

            # Warn if form looks like IPA (text wrapped in slashes).
            # XXX A Unicode-block or allow-list check of IPA characters
            # could be used here instead.
            if re.match(r"\s*/.*/\s*$", form):
                wxr.wtp.debug(
                    "inflection table form looks like IPA: "
                    "form={} tags={}".format(form, tags),
                    sortid="inflection/1840",
                )

            # Note that this checks `form`, not `in tags`
            if form == "dummy-ignored-text-cell":
                continue

            if "dummy-remove-this-cell" in tags:
                continue

            # Add the form
            tags_list = sorted(tags)
            dt: FormData = {
                "form": form,
                "tags": tags_list,
                "source": source,
            }
            if roman:
                dt["roman"] = roman
            if ipa:
                dt["ipa"] = ipa
            # Honour the explicit `links` argument; fall back to the
            # enclosing scope's `cell_links` when none was passed.
            form_links = links if links is not None else cell_links
            if form_links is not None and (
                matched_links := match_links_to_form(
                    wxr, form, form_links, None
                )
            ):
                dt["links"] = matched_links
            ret.append(dt)

            # If we got separate clitic form, add it
            if clitic:
                ret.append(
                    {
                        "form": clitic,
                        "tags": tags_list + ["clitic"],
                        "source": source,
                    }
                )
    return ret, form, some_has_covered_text
2067 # First extract definitions from cells
2068 # See defs_ht for footnote defs stuff
2069 for row in rows:
2070 for cell in row:
2071 text, refs, defs, hdr_tags = extract_cell_content(
2072 lang, word, cell.text
2073 )
2074 # refs, defs = footnote stuff, defs -> (ref, def)
2075 add_defs(defs)
2076 # Extract definitions from text after table
2077 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after)
2078 add_defs(defs)
2080 # Then extract the actual forms
2081 ret = []
2082 hdrspans: list[HdrSpan] = []
2083 first_col_has_text = False
2084 rownum = 0
2085 title = None
2086 global_tags = []
2087 table_tags = []
2088 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits")
2089 form_replacements = get_lang_conf(lang, "form_replacements")
2090 form_transformations = get_lang_conf(lang, "form_transformations")
2091 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells")
2092 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups")
2094 for title in titles:
2095 more_global_tags, more_table_tags, extra_forms = parse_title(
2096 title, source
2097 )
2098 global_tags.extend(more_global_tags)
2099 table_tags.extend(more_table_tags)
2100 ret.extend(extra_forms)
2101 cell_rowcnt: collections.defaultdict[int, int] = collections.defaultdict(
2102 int
2103 )
2104 seen_cells = set()
2105 has_covering_hdr = set()
2106 some_has_covered_text = False
2107 for row in rows:
2108 # print("ROW:", row)
2109 # print("====")
2110 # print(f"Start of PREVIOUS row hdrspans:"
2111 # f"{tuple(sp.tagsets for sp in hdrspans)}")
2112 # print(f"Start of row txt: {tuple(t.text for t in row)}")
2113 if not row: 2113 ↛ 2114line 2113 didn't jump to line 2114 because the condition on line 2113 was never true
2114 continue # Skip empty rows
2115 all_headers = all(x.is_title or not x.text.strip() for x in row)
2116 text = row[0].text
2117 if (
2118 row[0].is_title
2119 and text
2120 and not is_superscript(text[0])
2121 and text not in infl_map # zealous inflation map?
2122 and (
2123 re.match(r"Inflection ", text)
2124 or re.sub(
2125 r"\s+",
2126 " ", # flatten whitespace
2127 re.sub(
2128 r"\s*\([^)]*\)",
2129 "",
2130 # Remove whitespace+parens
2131 text,
2132 ),
2133 ).strip()
2134 not in infl_map
2135 )
2136 and not re.match(infl_start_re, text)
2137 and all(
2138 x.is_title == row[0].is_title and x.text == text
2139 # all InflCells in `row` have the same is_title and text
2140 for x in row
2141 )
2142 ):
2143 if text and title is None:
2144 # Only if there were no titles previously make the first
2145 # text that is found the title
2146 title = text
2147 if re.match(r"(Note:|Notes:)", title): 2147 ↛ 2148line 2147 didn't jump to line 2148 because the condition on line 2147 was never true
2148 continue # not a title
2149 more_global_tags, more_table_tags, extra_forms = parse_title(
2150 title, source
2151 )
2152 global_tags.extend(more_global_tags)
2153 table_tags.extend(more_table_tags)
2154 ret.extend(extra_forms)
2155 continue # Skip title rows without incrementing i
2156 if "dummy-skip-this" in global_tags: 2156 ↛ 2157line 2156 didn't jump to line 2157 because the condition on line 2156 was never true
2157 return []
2158 rowtags: list[tuple[str, ...]] = [()]
2159 # have_hdr = False
2160 # have_hdr never used?
2161 have_text = False
2162 samecell_cnt = 0
2163 col0_hdrspan = None # col0 or later header (despite its name)
2164 col0_followed_by_nonempty = False
2165 row_empty = True
2166 for col_idx, cell in enumerate(row):
2167 colspan = cell.colspan # >= 1
2168 rowspan = cell.rowspan # >= 1
2169 cell_links = cell.links # for weird links
2170 previously_seen = id(cell) in seen_cells
2171 # checks to see if this cell was in the previous ROW
2172 seen_cells.add(id(cell))
2173 if samecell_cnt == 0:
2174 # First column of a (possible multi-column) cell
2175 samecell_cnt = colspan - 1
2176 else:
2177 assert samecell_cnt > 0
2178 samecell_cnt -= 1
2179 continue
2181 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0
2182 # never used?
2184 # defaultdict(int) around line 1900
2185 cell_rowcnt[id(cell)] += 1
2186 # => how many cols this spans
2187 col: str = cell.text
2188 if not col:
2189 continue
2190 row_empty = False
2191 is_title = cell.is_title
2193 # If the cell has a target, i.e., text after colon, interpret
2194 # it as simply specifying a value for that value and ignore
2195 # it otherwise.
2196 if cell.target:
2197 text, refs, defs, hdr_tags = extract_cell_content(
2198 lang, word, col
2199 )
2200 if not text: 2200 ↛ 2201line 2200 didn't jump to line 2201 because the condition on line 2200 was never true
2201 continue
2202 refs_tags: set[str] = set()
2203 for ref in refs: # gets tags from footnotes 2203 ↛ 2204line 2203 didn't jump to line 2204 because the loop on line 2203 never started
2204 if ref in def_ht:
2205 refs_tags.update(def_ht[ref])
2206 rowtags = expand_header(
2207 wxr,
2208 tablecontext,
2209 word,
2210 lang,
2211 pos,
2212 text,
2213 [],
2214 silent=True,
2215 depth=depth,
2216 column_number=col_idx,
2217 )
2218 rowtags = list(
2219 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags)
2220 )
2221 is_title = False
2222 col = cell.target
2224 # print(rownum, col_idx, col)
2225 # print(f"is_title: {is_title}")
2226 if is_title:
2227 # It is a header cell
2228 text, refs, defs, hdr_tags = extract_cell_content(
2229 lang, word, col
2230 )
2231 if not text:
2232 continue
2233 # Extract tags from referenced footnotes
2234 refs_tags = set()
2235 for ref in refs:
2236 if ref in def_ht:
2237 refs_tags.update(def_ht[ref])
2239 # Expand header to tags
2240 v = expand_header(
2241 wxr,
2242 tablecontext,
2243 word,
2244 lang,
2245 pos,
2246 text,
2247 [],
2248 silent=True,
2249 depth=depth,
2250 column_number=col_idx,
2251 )
2252 # print("EXPANDED {!r} to {}".format(text, v))
2254 if col_idx == 0:
2255 # first_col_has_text is used for a test to ignore
2256 # upper-left cells that are just text without
2257 # header info
2258 first_col_has_text = True
2259 # Check if the header expands to reset hdrspans
2260 if any("dummy-reset-headers" in tt for tt in v):
2261 new_hdrspans = []
2262 for hdrspan in hdrspans:
2263 # if there are HdrSpan objects (abstract headers with
2264 # row- and column-spans) that are to the left or at the
2265 # same row or below, KEEP those; things above and to
2266 # the right of the hdrspan with dummy-reset-headers
2267 # are discarded. Tags from the header together with
2268 # dummy-reset-headers are kept as normal.
2269 if (
2270 hdrspan.start + hdrspan.colspan < col_idx
2271 or hdrspan.rownum > rownum - cell.rowspan
2272 ):
2273 new_hdrspans.append(hdrspan)
2274 hdrspans = new_hdrspans
2276 for tt in v:
2277 if "dummy-section-header" in tt: 2277 ↛ 2278line 2277 didn't jump to line 2278 because the condition on line 2277 was never true
2278 tablecontext.section_header = tt
2279 break
2280 if "dummy-reset-section-header" in tt: 2280 ↛ 2281line 2280 didn't jump to line 2281 because the condition on line 2280 was never true
2281 tablecontext.section_header = tuple()
2282 # Text between headers on a row causes earlier headers to
2283 # be reset
2284 if have_text:
2285 # print(" HAVE_TEXT BEFORE HDR:", col)
2286 # Reset rowtags if new title column after previous
2287 # text cells
2288 # +-----+-----+-----+-----+
2289 # |hdr-a|txt-a|hdr-B|txt-B|
2290 # +-----+-----+-----+-----+
2291 # ^reset rowtags=>
2292 # XXX beware of header "—": "" - must not clear on that if
2293 # it expands to no tags
2294 rowtags = [()]
2295 # have_hdr = True
2296 # have_hdr never used?
2297 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags))
2298 # Update rowtags and coltags
2299 has_covering_hdr.add(col_idx) # col_idx == current column
2300 # has_covering_hdr is a set that has the col_idx-ids of columns
2301 # that have previously had some kind of header. It is never
2302 # reset inside the col_idx-loops OR the bigger rows-loop, so
2303 # applies to the whole table.
2305 new_coltags: list[tuple[str, ...]]
2306 all_hdr_tags: list[tuple[str, ...]]
2307 rowtags, new_coltags, all_hdr_tags = generate_tags(
2308 rowtags, table_tags
2309 )
2311 if any("dummy-skip-this" in ts for ts in rowtags):
2312 continue # Skip this cell
2314 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2314 ↛ 2315line 2314 didn't jump to line 2315 because the condition on line 2314 was never true
2315 hdrspans.extend(tablecontext.stored_hdrspans)
2317 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2317 ↛ 2318line 2317 didn't jump to line 2318 because the condition on line 2317 was never true
2318 tablecontext.stored_hdrspans = []
2320 if any("dummy-store-hdrspan" in ts for ts in v): 2320 ↛ 2322line 2320 didn't jump to line 2322 because the condition on line 2320 was never true
2321 # print(f"STORED: {col}")
2322 store_new_hdrspan = True
2323 else:
2324 store_new_hdrspan = False
2326 new_coltags = list(
2327 x
2328 for x in new_coltags
2329 if not any(t in noinherit_tags for t in x)
2330 )
2331 # print("new_coltags={} previously_seen={} all_hdr_tags={}"
2332 # .format(new_coltags, previously_seen, all_hdr_tags))
2333 if any(new_coltags):
2334 (
2335 col,
2336 col0_followed_by_nonempty,
2337 col0_hdrspan,
2338 ) = add_new_hdrspan(
2339 col,
2340 hdrspans,
2341 store_new_hdrspan,
2342 col0_followed_by_nonempty,
2343 col0_hdrspan,
2344 )
2346 continue
2348 # These values are ignored, at least for now
2349 if re.match(r"^(# |\(see )", col): 2349 ↛ 2350line 2349 didn't jump to line 2350 because the condition on line 2349 was never true
2350 continue
2352 if any("dummy-skip-this" in ts for ts in rowtags):
2353 continue # Skip this cell
2355 # If the word has no rowtags and is a multi-row cell, then
2356 # ignore this. This happens with empty separator rows
2357 # within a rowspan>1 cell. cf. wander/English/Conjugation.
2358 if rowtags == [()] and rowspan > 1:
2359 continue
2361 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle.
2362 if cleanup_rules:
2363 for regx, substitution in cleanup_rules.items():
2364 col = re.sub(regx, substitution, col)
2366 if ( 2366 ↛ 2371line 2366 didn't jump to line 2371 because the condition on line 2366 was never true
2367 col_idx == 0
2368 and not first_col_has_text
2369 and get_lang_conf(lang, "ignore_top_left_text_cell") is True
2370 ):
2371 continue # Skip text at top left, as in Icelandic, Faroese
2373 # if col0_hdrspan is not None:
2374 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}"
2375 # .format(col0_hdrspan.text, col))
2376 col0_followed_by_nonempty = True
2377 have_text = True
2379 # Determine column tags for the multi-column cell
2380 combined_coltags = compute_coltags(
2381 lang, pos, hdrspans, col_idx, colspan, col
2382 )
2383 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2383 ↛ 2384line 2383 didn't jump to line 2384 because the condition on line 2383 was never true
2384 continue
2386 # Split the text into separate forms. First simplify spaces except
2387 # newline.
2388 col = re.sub(r"[ \t\r]+", " ", col)
2389 # Split the cell text into alternatives
2391 col, alts, split_extra_tags = split_text_into_alts(col)
2393 # Some cells have mixed form content, like text and romanization,
2394 # or text and IPA. Handle these.
2395 altss = handle_mixed_lines(alts, tablecontext)
2397 altsss = list((x, combined_coltags, cell_links) for x in altss)
2399 # Generate forms from the alternatives
2400 # alts is a list of (tuple of forms, tuple of tags)
2401 coltags: list[tuple[str, ...]]
2402 base_roman: str
2403 ipa: str
2404 for (form, base_roman, ipa), coltags, cell_links in altsss:
2405 form = form.strip()
2406 extra_tags: list[str] = []
2407 extra_tags.extend(split_extra_tags)
2408 # Handle special splits again here, so that we can have custom
2409 # mappings from form to form and tags.
2410 if form in form_replacements:
2411 replacement, tags = form_replacements[form]
2412 for x in tags.split():
2413 assert x in valid_tags
2414 assert isinstance(replacement, str)
2415 assert isinstance(tags, str)
2416 form = replacement
2417 extra_tags.extend(tags.split())
2419 check_romanization_form_transformation = False
2420 # loop over regexes in form_transformation and replace text
2421 # in form using regex patterns
2422 # this does a bit of the same stuff the above does,
2423 # but with regexes and re.sub() instead
2424 subst: str
2425 for (
2426 form_transformations_pos,
2427 vv,
2428 subst,
2429 tags,
2430 ) in form_transformations:
2431 # v is a pattern string, like "^ich"
2432 if (
2433 isinstance(form_transformations_pos, str)
2434 and pos != form_transformations_pos
2435 ) or (
2436 (not isinstance(form_transformations_pos, str))
2437 and pos not in form_transformations_pos
2438 ):
2439 continue
2440 m: re.Match | None = re.search(vv, form)
2441 if m is not None:
2442 if base_roman: 2442 ↛ 2443line 2442 didn't jump to line 2443 because the condition on line 2442 was never true
2443 for _, rom_v, rom_sub, _ in form_transformations:
2444 rom_m = re.search(rom_v, base_roman)
2445 if rom_m is not None:
2446 base_roman = re.sub(
2447 rom_v, rom_sub, base_roman
2448 )
2449 break
2450 form = re.sub(vv, subst, form)
2451 for x in tags.split():
2452 assert x in valid_tags
2453 extra_tags.extend(tags.split())
2454 check_romanization_form_transformation = True
2455 break
2457 # Clean the value, extracting reference symbols
2458 form, refs, defs, hdr_tags = extract_cell_content(
2459 lang, word, form
2460 )
2461 # if refs:
2462 # print("REFS:", refs)
2463 extra_tags.extend(hdr_tags)
2464 # Extract tags from referenced footnotes
2465 refs_tags = set()
2466 for ref in refs:
2467 if ref in def_ht:
2468 refs_tags.update(def_ht[ref])
2470 if base_roman:
2471 if check_romanization_form_transformation: 2471 ↛ 2475line 2471 didn't jump to line 2475 because the condition on line 2471 was never true
2472 # because form_transformations are used to handle things
2473 # where the romanization has the "same" structure, we
2474 # need to handle that here too....
2475 for (
2476 _,
2477 vv,
2478 subst,
2479 _,
2480 ) in form_transformations:
2481 # v is a pattern string, like "^ich"
2482 m = re.search(vv, base_roman)
2483 if m is not None:
2484 base_roman = re.sub(vv, subst, base_roman)
2485 # XXX add tag stuff here if needed
2486 break
2488 base_roman, _, _, hdr_tags = extract_cell_content(
2489 lang, word, base_roman
2490 )
2491 extra_tags.extend(hdr_tags)
2493 # Do some additional cleanup on the cell.
2494 form = re.sub(r"^\s*,\s*", "", form)
2495 form = re.sub(r"\s*,\s*$", "", form)
2496 form = re.sub(r"\s*(,\s*)+", ", ", form)
2497 form = re.sub(r"(?i)^Main:", "", form)
2498 form = re.sub(r"\s+", " ", form)
2499 form = form.strip()
2501 # Look for parentheses that have semantic meaning
2502 form, et = find_semantic_parens(form, lang)
2503 extra_tags.extend(et)
2505 # Handle parentheses in the table element. We parse
2506 # tags anywhere and romanizations anywhere but beginning.
2507 roman: str = base_roman
2508 paren: str | None = None
2509 clitic: str | None = None
2510 m = re.search(r"(\s+|^)\(([^)]*)\)", form)
2511 # start|spaces + (anything)
2512 if m is not None:
2513 subst = m.group(1)
2514 paren = m.group(2)
2515 else:
2516 m = re.search(r"\(([^)]*)\)(\s+|$)", form)
2517 # (anything) + spaces|end
2518 if m is not None: 2518 ↛ 2519line 2518 didn't jump to line 2519 because the condition on line 2518 was never true
2519 paren = m.group(1)
2520 subst = m.group(2)
2521 if paren is not None:
2522 form, roman, clitic = handle_parens(
2523 form, roman, clitic, extra_tags
2524 )
2526 # Ignore certain forms that are not really forms,
2527 # unless they're really, really close to the article title
2528 if form in ( 2528 ↛ 2533line 2528 didn't jump to line 2533 because the condition on line 2528 was never true
2529 "",
2530 "unchanged",
2531 "after an", # in sona/Irish/Adj/Mutation
2532 ):
2533 Lev = distw([form], word)
2534 if form and Lev < 0.1:
2535 wxr.wtp.debug(
2536 "accepted possible false positive '{}' with"
2537 "> 0.1 Levenshtein distance in {}/{}".format(
2538 form, word, lang
2539 ),
2540 sortid="inflection/2213",
2541 )
2542 elif form and Lev < 0.3:
2543 wxr.wtp.debug(
2544 "skipped possible match '{}' with > 0.3"
2545 "Levenshtein distance in {}/{}".format(
2546 form, word, lang
2547 ),
2548 sortid="inflection/2218",
2549 )
2550 continue
2551 else:
2552 continue
2553 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} "
2554 # "FORM={!r} ROMAN={!r}"
2555 # .format(rowtags, coltags, refs_tags,
2556 # form, roman))
2558 # Merge tags from row and column and do miscellaneous
2559 # tag-related handling.
2560 (
2561 merge_ret,
2562 form,
2563 some_has_covered_text,
2564 ) = merge_row_and_column_tags(
2565 form, some_has_covered_text, cell_links
2566 )
2567 ret.extend(merge_ret)
2569 # End of row.
2570 rownum += 1
2571 # For certain languages, if the row was empty, reset
2572 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb).
2573 if row_empty and get_lang_conf(lang, "empty_row_resets"):
2574 hdrspans = []
2575 # Check if we should expand col0_hdrspan.
2576 if col0_hdrspan is not None:
2577 col0_allowed = get_lang_conf(lang, "hdr_expand_first")
2578 col0_cats = tagset_cats(col0_hdrspan.tagsets)
2579 # Only expand if col0_cats and later_cats are allowed
2580 # and don't overlap and col0 has tags, and there have
2581 # been no disallowed cells in between.
2582 if (
2583 not col0_followed_by_nonempty
2584 and not (col0_cats - col0_allowed)
2585 and
2586 # len(col0_cats) == 1 and
2587 col_idx > col0_hdrspan.start + col0_hdrspan.colspan
2588 ):
2589 # If an earlier header is only followed by headers that yield
2590 # no tags, expand it to entire row
2591 # print("EXPANDING COL0: {} from {} to {} cols {}"
2592 # .format(col0_hdrspan.text, col0_hdrspan.colspan,
2593 # len(row) - col0_hdrspan.start,
2594 # col0_hdrspan.tagsets))
2595 col0_hdrspan.colspan = len(row) - col0_hdrspan.start
2596 col0_hdrspan.expanded = True
2597 # XXX handle refs and defs
2598 # for x in hdrspans:
2599 # print(" HDRSPAN {} {} {} {!r}"
2600 # .format(x.start, x.colspan, x.tagsets, x.text))
2602 # Post-process German nouns with articles in separate columns. We move the
2603 # definite/indefinite/usually-without-article markers into the noun and
2604 # remove the article entries.
2605 if get_lang_conf(lang, "articles_in_separate_columns") and any(
2606 "noun" in x["tags"] for x in ret
2607 ):
2608 new_ret = []
2609 saved_tags: set[str] = set()
2610 had_noun = False
2611 for dt in ret:
2612 tags = dt["tags"]
2613 # print(tags)
2614 if "noun" in tags:
2615 tags = list(
2616 sorted(set(t for t in tags if t != "noun") | saved_tags)
2617 )
2618 had_noun = True
2619 elif ( 2619 ↛ 2646line 2619 didn't jump to line 2646 because the condition on line 2619 was always true
2620 "indefinite" in tags
2621 or "definite" in tags
2622 or "usually-without-article" in tags
2623 or "without-article" in tags
2624 ):
2625 if had_noun:
2626 saved_tags = set(tags)
2627 else:
2628 saved_tags = saved_tags | set(tags) # E.g. Haus/German
2629 remove_useless_tags(lang, pos, saved_tags)
2630 saved_tags = saved_tags & set(
2631 [
2632 "masculine",
2633 "feminine",
2634 "neuter",
2635 "singular",
2636 "plural",
2637 "indefinite",
2638 "definite",
2639 "usually-without-article",
2640 "without-article",
2641 ]
2642 )
2643 had_noun = False
2644 continue # Skip the articles
2646 dt = dt.copy()
2647 dt["tags"] = tags
2648 new_ret.append(dt)
2649 ret = new_ret
2651 elif possibly_ignored_forms:
2652 # Some languages have tables with cells that are kind of separated
2653 # and difficult to handle, like eulersche Formel/German where
2654 # the definite and indefinite articles are just floating.
2655 # If a language has a dict of conditionally_ignored_cells,
2656 # and if the contents of a cell is found in one of the rules
2657 # there, ignore that cell if it
2658 # 1. Does not have the appropriate tag (like "definite" for "die")
2659 # and
2660 # 2. The title of the article is not one of the other co-words
2661 # (ie. it's an article for the definite articles in german etc.)
2662 # pass
2663 new_ret = []
2664 for cell_data in ret:
2665 tags = cell_data["tags"]
2666 text = cell_data["form"]
2667 skip_this = False
2668 for key_tag, ignored_forms in possibly_ignored_forms.items():
2669 if text not in ignored_forms: 2669 ↛ 2671line 2669 didn't jump to line 2671 because the condition on line 2669 was always true
2670 continue
2671 if word in ignored_forms:
2672 continue
2673 if key_tag not in tags:
2674 skip_this = True
2676 if skip_this: 2676 ↛ 2677line 2676 didn't jump to line 2677 because the condition on line 2676 was never true
2677 continue
2678 new_ret.append(cell_data)
2680 ret = new_ret
2682 # Post-process English inflection tables, adding "multiword-construction"
2683 # when the number of words has increased.
2684 if lang == "English" and pos == "verb":
2685 word_words = len(word.split())
2686 new_ret = []
2687 for dt in ret:
2688 form = dt.get("form", "")
2689 if len(form.split()) > word_words:
2690 dt = dt.copy()
2691 dt["tags"] = list(dt.get("tags", []))
2692 # This strange copy-assigning shuffle is preventative black
2693 # magic; do not touch lest you invoke deep bugs.
2694 data_append(dt, "tags", "multiword-construction")
2695 new_ret.append(dt)
2696 ret = new_ret
2698 # Always insert "table-tags" detail as the first entry in any inflection
2699 # table. This way we can reliably detect where a new table starts.
2700 # Table-tags applies until the next table-tags entry.
2701 if ret or table_tags:
2702 table_tags = sorted(set(table_tags))
2703 dt = {
2704 "form": " ".join(table_tags),
2705 "source": source,
2706 "tags": ["table-tags"],
2707 }
2708 if dt["form"] == "":
2709 dt["form"] = "no-table-tags"
2710 if tablecontext.template_name:
2711 tn: FormData = {
2712 "form": tablecontext.template_name,
2713 "source": source,
2714 "tags": ["inflection-template"],
2715 }
2716 ret = [dt] + [tn] + ret
2717 else:
2718 ret = [dt] + ret
2720 return ret
def find_semantic_parens(form: str, lang: str) -> tuple[str, list[str]]:
    """Strip one layer of wrapping brackets from ``form``.

    "Some languages" (=Greek, e.g. είμαι/Greek/Verb) use brackets to mark
    things that require tags, like (informality), [rarity] and
    {archaicity}.  When the whole form is wrapped in one of the recognised
    bracket shapes, the brackets are always removed; the matching tags are
    returned only if every relevant language configuration flag is set.

    Returns a ``(form, extra_tags)`` tuple.  A form that is not entirely
    wrapped in brackets is returned unchanged with an empty tag list.
    """
    # (pattern, bracket width, required language-config keys, tags to add).
    # Order matters: the combined "{[...]}" shape is tried before "{...}".
    bracket_rules = (
        (
            r"\([^][(){}]*\)$",
            1,
            ("parentheses_for_informal",),
            ("informal",),
        ),
        (
            r"\{\[[^][(){}]*\]\}$",
            2,
            ("square_brackets_for_rare", "curly_brackets_for_archaic"),
            ("rare", "archaic"),
        ),
        (
            r"\{[^][(){}]*\}$",
            1,
            ("curly_brackets_for_archaic",),
            ("archaic",),
        ),
        (
            r"\[[^][(){}]*\]$",
            1,
            ("square_brackets_for_rare",),
            ("rare",),
        ),
    )
    for pattern, width, conf_keys, tags in bracket_rules:
        if not re.match(pattern, form):
            continue
        unwrapped = form[width:-width]
        # all() short-circuits, so the config is consulted in the listed
        # order and only as far as necessary.
        if all(get_lang_conf(lang, key) for key in conf_keys):
            return unwrapped, list(tags)
        return unwrapped, []
    return form, []
def handle_mixed_lines(
    alts: list[str], tablecontext: "TableContext"
) -> list[tuple[str, str, str]]:
    """Pair up the alternatives of one table cell.

    Handles the special case where romanization is given under the normal
    form, e.g. in Russian.  There can be multiple comma-separated forms in
    each case.  Also handles the case where instead of romanization we
    have IPA pronunciation (e.g., avoir/French/verb).

    ``alts`` holds the separate text pieces of the cell.
    ``tablecontext`` is only consulted for its ``template_name`` (for the
    Georgian ``ka-decl-noun`` kludge).

    Returns a list of ``(form, romanization, ipa)`` tuples; the parts that
    do not apply are empty strings.

    Fix relative to the previous version: when expanding optional
    parenthesised parts ("kind(er)"), the text between two parenthesised
    groups is now added to *every* partial form.  Previously the running
    index was advanced inside the loop over partial forms, so only the
    first partial form received that text ("foo(s) bar(s)" produced
    "foosr" instead of "foos bar").
    """
    half = len(alts) // 2

    if len(alts) == 1 and "(" not in alts[0]:
        return [(alts[0], "", "")]

    def is_ipa(text: str) -> bool:
        # Text wrapped in slashes is taken to be an IPA pronunciation.
        return re.match(r"^\s*/.*/\s*$", text) is not None

    # Check for IPAs (forms first, IPAs under): base, base, IPA, IPA
    if (
        len(alts) % 2 == 0  # Divisible by two
        and all(is_ipa(x) for x in alts[half:])  # second half is IPA
        and not any(is_ipa(x) for x in alts[:half])  # first half is not
    ):
        # List of tuples: (base, "", ipa)
        return [(alts[i], "", alts[i + half]) for i in range(half)]
    # base, base, base, IPA: only the last alt is IPA
    if (
        len(alts) > 2
        and is_ipa(alts[-1])
        and all(not x.startswith("/") for x in alts[:-1])
    ):
        return [(x, "", alts[-1]) for x in alts[:-1]]
    # base, IPA, IPA, IPA: first is base and the rest are IPA alternatives
    if (
        len(alts) > 2
        and not alts[0].startswith("/")
        and all(is_ipa(x) for x in alts[1:])
    ):
        return [(alts[0], "", x) for x in alts[1:]]

    # Classify each alternative.  Superscript characters and trailing
    # footnote asterisks are removed first because they mess with the
    # classification; anything from a "^" to the end of the string is
    # removed as well (^xyz needs to be removed separately).
    alt_classifications = [
        classify_desc(
            re.sub(
                r"\^.*$",
                "",
                "".join(ch for ch in x if not is_superscript(ch)).strip("* "),
            )
        )
        for x in alts
    ]

    no_parens = not any("(" in x for x in alts)
    roman_kinds = ("romanization", "english")

    # Check for romanizations, forms first, romanizations under
    if (
        len(alts) % 2 == 0
        and no_parens
        and all(x == "other" for x in alt_classifications[:half])
        and all(x in roman_kinds for x in alt_classifications[half:])
    ):
        return [(alts[i], alts[i + half], "") for i in range(half)]
    # Check for romanizations, forms and romanizations alternating
    if (
        len(alts) % 2 == 0
        and no_parens
        and all(
            alt_classifications[i] == "other" for i in range(0, len(alts), 2)
        )
        and all(
            alt_classifications[i] in roman_kinds
            for i in range(1, len(alts), 2)
        )
    ):
        return [(alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)]
    # Handle complex Georgian entries with alternative forms and
    # romanizations. It's a bit of a mess. Remove this kludge if not
    # needed anymore. NOTE THAT THE PARENTHESES ON THE WEBSITE ARE NOT
    # DISPLAYED. They are put inside their own span elements that are
    # then hidden with some CSS.
    # ამერიკის შეერთებულ შტატებს(ა) (ameriḳis šeertebul šṭaṭebs(a))
    # The above should generate two alts entries, with two different
    # parallel versions, one without (a) and with (a) at the end,
    # for both the Georgian original and the romanization.
    if tablecontext.template_name == "ka-decl-noun" and any(
        " (" in alt_ for alt_ in alts
    ):
        return ka_decl_noun_template_cell(alts)
    # One base form followed only by romanizations
    if (
        len(alts) > 2
        and alt_classifications[0] == "other"
        and all(x in roman_kinds for x in alt_classifications[1:])
    ):
        return [(alts[0], x, "") for x in alts[1:]]

    # Fallback: expand optional/alternative parts written in parentheses.
    new_alts: list[str] = []
    for alt in alts:
        variants = [""]
        idx = 0  # end of the part of alt already copied into variants
        for m in re.finditer(
            # (start OR letter OR asterisk) "(" word[/word...] ")"
            #  group 1                          group 2
            r"(^|\w|\*)\((\w+(/\w+)*)\)",
            alt,
        ):
            v = m.group(2)  # word/word/word...
            if (
                classify_desc(v) == "tags"  # Tags inside parens
                or m.group(0) == alt  # All in parens
            ):
                continue
            # Text shared by every variant: everything since the last
            # expansion up to and including the character before "(".
            # It must be computed once per match, before idx moves on.
            prefix = alt[idx : m.start()] + m.group(1)
            idx = m.end()
            vparts = v.split("/")
            if len(vparts) == 1:
                # "kind(er)" -> ["kind", "kinder"]
                suffixes = ["", v]
            else:
                # "lampai(tten/den)" -> ["lampaitten", "lampaiden"]
                suffixes = vparts
            variants = [x + prefix + s for x in variants for s in suffixes]
        # add the end of alt
        new_alts.extend(x + alt[idx:] for x in variants)
    # [form, no romanization, no ipa]
    return [(x, "", "") for x in new_alts]
def handle_generic_table(
    wxr: WiktextractContext,
    tablecontext: "TableContext",
    data: WordData,
    word: str,
    lang: str,
    pos: str,
    rows: list[list[InflCell]],
    titles: list[str],
    source: str,
    after: str,
    depth: int,
) -> None:
    """Parse ``rows`` as an inflection table and add the forms to ``data``.

    The table is handed to ``parse_simple_table()``; the forms it returns
    are appended to ``data["forms"]`` with exact duplicates dropped.  If
    the table cannot be parsed a debug message is logged and ``data`` is
    left untouched.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for table_row in rows:
        assert isinstance(table_row, list)
        for cell in table_row:
            assert isinstance(cell, InflCell)
    assert isinstance(titles, list)
    for title in titles:
        assert isinstance(title, str)

    # Try to parse the table as a simple table
    parsed_forms = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if parsed_forms is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms but eliminate duplicates.
    seen_forms = set()
    for entry in parsed_forms:
        frozen_entry = freeze(entry)
        if frozen_entry in seen_forms:
            continue  # Don't add duplicates

        entry_tags = entry.get("tags", [])
        # Some Russian words have Declension and Pre-reform declension
        # partially duplicating same data. Don't add the "dated" variant
        # if we already have the same form without "dated" from the modern
        # declension table.
        if "dated" in entry_tags:
            undated_entry = entry.copy()
            undated_tags = [tag for tag in entry_tags if tag != "dated"]
            undated_entry["tags"] = undated_tags
            if undated_tags and freeze(undated_entry) in seen_forms:
                continue  # Already have it without "dated"

        # "table-tags" marker entries are never recorded as seen, so
        # they are not de-duplicated against later ones.
        if "table-tags" not in entry_tags:
            seen_forms.add(frozen_entry)
        data_append(data, "forms", entry)
def determine_header(
    wxr: WiktextractContext,
    tablecontext: "TableContext",
    lang: str,
    word: str,
    pos: str,
    table_kind: NodeKind,
    kind: NodeKind | str,
    style: str | None,
    row: list[InflCell],
    col: WikiNode,
    celltext: str,
    titletext: str,
    cols_headered: list[bool],
    target: str | None,
    cellstyle: str,
    # is_title,
    # hdr_expansion,
    # target,
    # celltext,
) -> tuple[bool, list[tuple[str, ...]], str | None, str]:
    """Decide whether a table cell should be treated as a header.

    Parameters:
      wxr, tablecontext, word, lang, pos: passed on to ``expand_header()``;
        ``wxr`` is also used for debug messages.
      table_kind: kind of the enclosing table node; selects which cell
        kind counts as a "real" header cell (``TABLE_HEADER_CELL`` for
        wikitext tables, ``"th"`` for HTML tables).
      kind: kind of the current cell (a ``NodeKind`` or an HTML tag name).
      style: style string of the first cell of the current row, or None.
      row: cells collected so far for the current row; only its length
        (the current column index) is used.
      col: the cell node; its children are scanned for ``<span lang="az">``.
      celltext: cleaned text of the cell.
      titletext: the cell text as prepared by the caller for title checks.
      cols_headered: per-column flags; a true flag makes non-header cells
        in that column headers.
      target: returned unchanged unless the cell has the "label: target"
        form described below.
      cellstyle: style string of the current cell.

    Returns ``(is_title, hdr_expansion, target, celltext)`` where
    ``hdr_expansion`` is the result of ``expand_header()`` on the cleaned
    cell text and ``celltext`` may have been cut at the first ": ".
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    header_kind: NodeKind | str
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    # NOTE(review): header_kind stays unbound for any other table_kind;
    # the visible caller only passes TABLE or HTML nodes.
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # A non-header cell whose text would be valid as a header: only
        # keep it as a candidate if the language explicitly lists this
        # text in LANGUAGES_WITH_CELLS_AS_HEADERS.
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # "label: target" cells: the part before ": " is a known header and
    # the part after it becomes the target.
    # NOTE(review): idx is computed on celltext but applied to titletext;
    # the two can differ if celltext has leading whitespace -- confirm.
    if idx >= 0 and titletext[:idx] in infl_map:
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        # A real header cell...
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        # ...that does not contain a <span lang="az"> child.
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        # A heuristic candidate that passes the distw() check against
        # the word itself.
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        # Style-based detection is also gated on the per-language list;
        # only the final "accepted" branch marks the cell as a title.
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    # Table captions and note cells are always titles.
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext
3186class TableContext:
3187 """Saved context used when parsing a table and its subtables."""
3189 __slot__ = (
3190 "stored_hdrspans",
3191 "section_header",
3192 "template_name",
3193 )
3195 def __init__(self, template_name: str | None = None) -> None:
3196 self.stored_hdrspans: list[HdrSpan] = []
3197 self.section_header: tuple[str, ...] = tuple()
3198 if template_name is None:
3199 self.template_name = ""
3200 else:
3201 self.template_name = template_name
def handle_wikitext_or_html_table(
    wxr: WiktextractContext,
    word: str,
    lang: str,
    pos: str,
    data: WordData,
    tree: WikiNode,
    titles: list[str],
    source: str,
    after: str,
    tablecontext: TableContext | None = None,
) -> None:
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms.  This adds the forms into ``data``.

    ``tree`` must be a wikitext TABLE node or an HTML ``table`` node.
    ``titles`` are the titles collected for the table so far, ``source``
    is passed through to handle_generic_table(), and ``after`` is text
    associated with the table that is passed through as well.  When
    ``tablecontext`` is not given a fresh TableContext is created.
    Nested tables are flattened into a sequence of separate tables that
    are handled in document order."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    if not tablecontext:
        tablecontext = TableContext()

    # Get language specific text removal patterns: the configuration maps
    # collections of parts of speech to the patterns that apply to them;
    # the first entry containing ``pos`` wins.
    remove_text_patterns: tuple[str | re.Pattern, ...] | None = None
    if rem := get_lang_conf(lang, "remove_text_patterns"):
        for poses in rem.keys():
            if pos in poses:
                remove_text_patterns = rem[poses]
                break

    def handle_table1(
        wxr: WiktextractContext,
        tablecontext: TableContext,
        word: str,
        lang: str,
        pos: str,
        data: WordData,
        tree: WikiNode,
        titles: list[str],
        source: str,
        after: str,
        depth: int,
    ) -> list[tuple[list[list[InflCell]], list[str], str, int]]:
        # rows, titles, after, depth
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function returns a list of
        (rows, titles, after, depth) tuples that is then iterated through
        in the main function at the end, creating a longer table (still
        in pieces) in the correct order."""

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)

        # Filling for columns with rowspan > 1
        col_gap_data: list[InflCell | None] = []
        # Number of remaining rows for which to fill the column
        vertical_still_left: list[int] = []
        cols_headered: list[bool] = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows: list[list[InflCell]] = []

        # Pieces produced by nested tables (and the rows preceding them).
        sub_ret = []

        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            # HTML nodes are identified by tag name, wikitext nodes by
            # their NodeKind.
            kind: NodeKind | str
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # A caption marked as the inflection table title replaces
                # the titles collected so far.
                if "inflection-table-title" in node.attrs.get("class", ""):
                    titles = [clean_node(wxr, None, node.children)]
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are initially shown in
                    # tables that have more data.  The hidden data
                    # duplicates these rows, so we skip it and just
                    # process the hidden data.
                    continue

                # Parse a table row.
                row: list[InflCell] = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in get_table_cells(node):
                    # loop through each cell in the ROW; non-WikiNode
                    # children are already skipped by get_table_cells().
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print(" UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column.  It is populated at the end of the loop,
                        # at the same time as col_gap_data.  This needs to
                        # be looped and filled this way because each
                        # `for col`-iteration jumps straight to the next
                        # meaningful cell; there are no "None" cells, only
                        # emptiness between, so cells spanning down from
                        # earlier rows have to be filled in here.
                        vertical_still_left[len(row)] -= 1

                        # KJ Apr 2026
                        # type checking is ignored; I am pretty sure that
                        # row will never contain None, even if col_gap_data
                        # is `InflCell | None`, but this code is such
                        # spaghetti that it's hard to figure out, except
                        # by the process of elimination: this has never
                        # caused trouble before, ergo, it works.
                        row.append(col_gap_data[len(row)])  # type: ignore

                        # Appending to row is how "indexing" is done here:
                        # each appended cell (a filler here, or a real
                        # cell at the end of the cell loop) increases
                        # len(row), which is then used as the column index
                        # to check for gaps.  vertical_still_left is the
                        # countdown to when to stop filling in gaps, and
                        # col_gap_data is not touched except when a new
                        # rowspan is needed, at the same time that
                        # vertical_still_left gets reassigned.

                    # If either attribute fails to parse, both spans fall
                    # back to 1.
                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        rowspan = 1
                        colspan = 1

                    # Clamp spans to 100.  This is done silently: there
                    # were too many such errors to report each of them.
                    if colspan > 100:
                        colspan = 100
                    if rowspan > 100:
                        rowspan = 100

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: (
                            isinstance(x, WikiNode)
                            and (x.kind == NodeKind.TABLE or x.sarg == "table")
                        ),
                    )

                    # Clean the rest of the cell.
                    link_capture_dict: dict = {}
                    celltext = clean_node(
                        wxr, link_capture_dict, rest, collect_links=True
                    )
                    cell_links: list[tuple[str, str]] | None = (
                        link_capture_dict.get("links", None)
                    )

                    # Remove regexed patterns from text
                    if remove_text_patterns is not None:
                        for pat in remove_text_patterns:
                            celltext = re.sub(pat, "", celltext)

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have
                        # subtitles as normal paragraphs in the same cell
                        # under a descriptive text that should be treated
                        # as a title (e.g., "Forms with the definite
                        # article", with "definite" not mentioned
                        # elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,  # type: ignore
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Emit the rows gathered so far as their own
                            # piece, then the nested table's pieces, and
                            # start collecting a fresh piece afterwards.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    # Title text: the cell text without surrounding
                    # whitespace and trailing superscript characters.
                    titletext = celltext.strip()
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # Once row_has_nonempty_cells has become True it stays
                    # True for the rest of the row.
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target, cell_links
                    )
                    for _ in range(0, colspan):
                        # All the data-filling for colspan is done simply
                        # in this loop, while rowspan needs to use
                        # vertical_still_left to count gaps and
                        # col_gap_data to fill in those gaps with InflCell
                        # data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Grow col_gap_data and
                                # vertical_still_left so that they have a
                                # slot for this column; neither list is
                                # reset to [] while this table is parsed.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of
                            # rowspan and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # Number of rows below this one that still
                            # need this cell; decremented each time a row
                            # takes the cell from col_gap_data.
                            vertical_still_left[len(row)] = rowspan - 1
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    # Pad with empty cells up to column i first.
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])  # type: ignore
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # Cells and spans directly under the table (outside of any
                # row) are ignored.
                pass

        # The rows collected last always come after any nested-table
        # pieces.
        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects.  Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )
def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """Yield the cells of the table row ``node``.

    A wikitext table cell sometimes contains HTML cells (``<th>``/``<td>``)
    because it is easier to write wikitext conditionals that way; those
    elements are parsed as children of the wikitext cell.  This generator
    yields every WikiNode that is a direct child of ``node``.  When such a
    child has direct ``th``/``td`` HTML children, they are detached from
    it (so they are not returned twice) and yielded right after it.
    """

    def is_html_cell(child) -> bool:
        return isinstance(child, HTMLNode) and child.sarg in ("th", "td")

    for cell in node.children:
        if not isinstance(cell, WikiNode):
            continue
        embedded_cells = [c for c in cell.children if is_html_cell(c)]
        if embedded_cells:
            # Keep only the non-cell content in the enclosing cell.
            cell.children = [c for c in cell.children if not is_html_cell(c)]
        yield cell
        yield from embedded_cells
def handle_html_table(
    wxr: WiktextractContext,
    word: str,
    lang: str,
    pos: str,
    data: WordData,
    tree: WikiNode,
    titles: list[str],
    source: str,
    after: str,
    tablecontext: TableContext | None = None,
) -> None:
    """Entry point for HTML tables.

    Forwards all arguments unchanged to handle_wikitext_or_html_table().
    XXX, remove these?"""
    handle_wikitext_or_html_table(
        wxr=wxr,
        word=word,
        lang=lang,
        pos=pos,
        data=data,
        tree=tree,
        titles=titles,
        source=source,
        after=after,
        tablecontext=tablecontext,
    )
def handle_wikitext_table(
    wxr: WiktextractContext,
    word: str,
    lang: str,
    pos: str,
    data: WordData,
    tree: WikiNode,
    titles: list[str],
    source: str,
    after: str,
    tablecontext: TableContext | None = None,
) -> None:
    """Entry point for wikitext tables.

    This is a pure pass-through: every argument is forwarded unchanged,
    in the same order, to ``handle_wikitext_or_html_table``, which handles
    both table flavours.  XXX remove these wrappers?
    """
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )
def parse_inflection_section(
    wxr: WiktextractContext,
    data: WordData,
    word: str,
    lang: str,
    pos: str,
    section: str,
    tree: WikiNode,
    tablecontext: TableContext | None = None,
) -> None:
    """Parses an inflection section on a page.  ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    The parse tree ``tree`` (which must be a ROOT node) is walked
    recursively.  Wikitext tables and HTML ``<table>`` elements are
    collected together with the titles that apply to them and with any
    plain text that follows them, and are then handed to
    ``handle_wikitext_table`` / ``handle_html_table``.  ``section`` is
    passed to those handlers as the ``source`` argument.

    Special cases handled during the walk:

    - ``<div class="NavFrame">`` elements are processed as a unit: text
      collected inside the frame before its ``NavContent`` element becomes
      an additional title for the tables inside ``NavContent`` (unless
      that text starts with "Note:" or "Notes:").
    - HTML tables whose class attribute contains "audiotable", ``<ref>``
      elements and nested subsections (LEVEL2..LEVEL6) are skipped.
    - For links only the link arguments are descended into.
    - A ``;`` list (bolded title line) is remembered and used as the
      initial title for the top-level children of ``tree`` that are
      visited after it.

    If ``wxr.config.expand_tables`` is set and ``section`` is not
    "Mutation", the word, language, part-of-speech, section name and the
    wikitext of ``tree`` are additionally written (UTF-8, overwriting) to
    the file named by that setting; this is used for creating inflection
    tests.

    Raises RuntimeError if a collected table has an unknown kind.
    """

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    #       .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Tables collected so far as (kind, node, titles, after_parts).  Plain
    # strings met after a table are appended to that table's after_parts.
    tables: list[
        tuple[Literal["html", "wikitext"], WikiNode, list[str], list[str]]
    ] = []
    # Text fragments collected inside a NavFrame; joined into its title.
    titleparts: list[str] = []
    # Text of the most recent ";" list seen, or "" if there was none.
    preceding_bolded_title = ""

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("--------------******************----------------")

    def process_tables() -> None:
        """Dispatch every table currently in ``tables`` to its handler."""
        for kind, node, titles, after_l in tables:
            after = "".join(after_l).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node: WikiNode | str, titles: list[str]) -> None:
        """Process one NavFrame: collect and handle its tables in isolation
        from the tables gathered outside the frame."""
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        # Temporarily swap in a fresh table list so that process_tables()
        # below only sees the tables found inside this frame.
        old_tables = tables
        tables = []

        # NOTE(review): ``titles`` is not forwarded here; the frame starts
        # with an empty title list.  Kept as in the original - confirm
        # whether this is intentional.
        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(
        node: WikiNode
        | str
        | list[WikiNode | str]
        | list[list[WikiNode | str]],
        titles: list[str],
        navframe=False,
    ) -> None:
        """Walk ``node``, collecting tables into ``tables``.  ``navframe``
        is True while inside a NavFrame but outside its NavContent."""
        nonlocal tables
        nonlocal preceding_bolded_title
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            if tables:
                # Text after a table belongs to the latest table's "after".
                tables[-1][-1].append(node)
            elif navframe:
                # Text before any table inside a NavFrame is title text.
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    # The content itself is parsed as ordinary
                    # (non-navframe) material with the extended titles.
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(("wikitext", node, titles, []))
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                # Note: this is a substring test on the raw class
                # attribute, not a test on whitespace-separated class names.
                htmlclasses = node.attrs.get("class", ())
                if "audiotable" in htmlclasses:
                    return
                tables.append(("html", node, titles, []))
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
            # This check must stay in the non-navframe branch:
            # recurse_navframe() re-enters recurse() on the same node with
            # navframe=True, so testing it there would never terminate.
            if (
                kind == NodeKind.HTML
                and node.sarg == "div"
                and "NavFrame" in node.attrs.get("class", "").split()
            ):
                recurse_navframe(node, titles)
                return
        if kind == NodeKind.LINK:
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.HTML and node.sarg == "ref":
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # Imported here rather than at module level, as in the
            # original code.
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            # Explicit UTF-8: words and wikitext routinely contain
            # non-ASCII characters, and the locale default encoding is not
            # guaranteed to be able to represent them.
            with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")