Coverage for src / wiktextract / extractor / en / inflection.py: 87%
1542 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-11 04:48 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-11 04:48 +0000
1# Code for parsing inflection tables.
2#
3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org.
5import collections
6import copy
7import functools
8import html
9import re
10import unicodedata
11from typing import TYPE_CHECKING, Generator, Literal, Optional, Union
13from mediawiki_langcodes import code_to_name, name_to_code
14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode
16from ...clean import clean_value
17from ...datautils import data_append, freeze, split_at_comma_semi
18from ...tags import valid_tags
19from ...wxr_context import WiktextractContext
20from .form_descriptions import (
21 classify_desc,
22 decode_tags,
23 distw,
24 match_links_to_form,
25 parse_head_final_tags,
26)
27from .inflection_kludges import ka_decl_noun_template_cell
28from .inflectiondata import infl_map, infl_start_map, infl_start_re
29from .lang_specific_configs import get_lang_conf, lang_specific_tags
30from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS
31from .type_utils import FormData, WordData
# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
debug_cell_text: Optional[str] = None


def set_debug_cell_text(text: str) -> None:
    """Set the module-global debug trigger text (--debug-text-cell)."""
    global debug_cell_text
    debug_cell_text = text
# A tagset list: each element is one alternative interpretation of a header,
# given as a sorted tuple of tag strings.
TagSets = list[tuple[str, ...]]

# Column texts that are interpreted as an empty column.  Mostly various
# Unicode hyphen/dash codepoints, plus a few explicit "no such form" phrases.
IGNORED_COLVALUES = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "-",
    "/",
    "?",
    "not used",
    "not applicable",
}
# These tags are never inherited from above
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags.  Every entry
# maps tag -> "object-" + tag; the Bantu noun-class entries (class-1 ..
# class-18) are generated rather than written out one by one.  The
# insertion order matches the original hand-written listing.
object_concord_replacements = {
    tag: f"object-{tag}"
    for tag in [
        "first-person",
        "second-person",
        "third-person",
        "singular",
        "plural",
        "definite",
        "indefinite",
        *[f"class-{i}" for i in range(1, 19)],
        "masculine",
        "feminine",
    ]
}
# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# "Inflection/Conjugation/Declension/Mutation of X" headings; matches of
# this part inside the title regexes below are skipped rather than mapped.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)
# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
    "contracted": "contracted",
    "present": "present",
    "perfect": "perfect",
    "imperfect": "imperfect",
    "pluperfect": "pluperfect",
    "future": "future",
    "aorist": "aorist",
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )

# Case-insensitive word-boundary match of the map keys (or the ignorable
# "... of X" heading part, which is filtered out again in parse_title).
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)
# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "iterative": "iterative",
    "poetic": "poetic",
    # Ancient Greek dialect labels follow.
    "Attic": "Attic",
    "Epic": "Epic",
    "Aeolic": "Aeolic",
    "Arcadocypriot": "Arcadocypriot",
    "Old Attic": "Old-Attic",
    "Boeotian": "Boeotian",
    "Byzantine": "Byzantine",
    "Choral Doric": "Choral-Doric",
    "Doric": "Doric",
    "Elean": "Elean",
    "Epirote": "Epirote",
    "Ionic": "Ionic",
    "Koine": "Koine",
    "Cretan": "Cretan",
    "Corinthian": "Corinthian",
    "Laconian": "Laconian",
    "Later poetic": "Later-poetic-Ancient-Greek",
    "Lesbian": "Lesbian",
    "Locrian": "Locrian",
    "Lyric": "Lyric-Ancient-Greek",
    "Thessalian": "Thessalian",
    "Tragic": "Tragic-Ancient-Greek",
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Parenthesized element starts to map them to tags for form for the rest of
# the element
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# Anchored at string start; the trailing space ensures the keyword is a
# separate word followed by the actual class/auxiliary value.
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)
# Regexp for cell starts that are likely definitions of reference symbols.
# See also nondef_re.  Groups 3, 5 and 6 capture the actual reference
# symbol (asterisks/daggers/digits, superscript digit, or superscript
# letter respectively); extract_cell_content relies on those group numbers.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"
318class InflCell:
319 """Cell in an inflection table."""
321 __slots__ = (
322 "text",
323 "is_title",
324 "colspan",
325 "rowspan",
326 "target",
327 "links",
328 )
330 def __init__(
331 self,
332 text: str,
333 is_title: bool,
334 colspan: int,
335 rowspan: int,
336 target: str | None,
337 cell_links: list[tuple[str, str]] | None = None,
338 ) -> None:
339 assert isinstance(text, str)
340 assert is_title in (True, False)
341 assert isinstance(colspan, int) and colspan >= 1
342 assert isinstance(rowspan, int) and rowspan >= 1
343 assert target is None or isinstance(target, str)
344 self.text = text.strip()
345 self.is_title = text and is_title
346 self.colspan = colspan
347 self.rowspan = rowspan
348 self.target = target
349 self.links = cell_links
351 def __str__(self) -> str:
352 v = "{}/{}/{}/{!r}".format(
353 self.text, self.is_title, self.colspan, self.rowspan
354 )
355 if self.target:
356 v += ": {!r}".format(self.target)
357 return v
359 def __repr__(self) -> str:
360 return str(self)
363class HdrSpan:
364 """Saved information about a header cell/span during the parsing
365 of a table."""
367 __slots__ = (
368 "start",
369 "colspan",
370 "rowspan",
371 "rownum", # Row number where this occurred
372 "tagsets", # list of tuples
373 "text", # For debugging
374 "all_headers_row",
375 "expanded", # The header has been expanded to cover whole row/part
376 )
378 def __init__(
379 self,
380 start: int,
381 colspan: int,
382 rowspan: int,
383 rownum: int,
384 tagsets: TagSets,
385 text: str,
386 all_headers_row: bool,
387 ) -> None:
388 assert isinstance(start, int) and start >= 0
389 assert isinstance(colspan, int) and colspan >= 1
390 assert isinstance(rownum, int)
391 assert isinstance(tagsets, list)
392 for x in tagsets:
393 assert isinstance(x, tuple)
394 assert all_headers_row in (True, False)
395 self.start = start
396 self.colspan = colspan
397 self.rowspan = rowspan
398 self.rownum = rownum
399 self.tagsets = list(tuple(sorted(set(tags))) for tags in tagsets)
400 self.text = text
401 self.all_headers_row = all_headers_row
402 self.expanded = False
def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        uname = unicodedata.name(ch)
    except ValueError:
        # Codepoint has no Unicode name (unassigned/control) -> not superscript.
        return False
    # Superscripts are identified purely by their Unicode name prefix.
    return uname.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )
def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no purpose
    together (cover all options).

    Mutates ``tags`` in place and returns None.  ``lang`` selects the
    per-language configuration via get_lang_conf; ``pos`` is currently unused
    but kept for interface stability.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Special paired removals controlled by explicit boolean config flags.
    if (
        "animate" in tags
        and "inanimate" in tags
        and get_lang_conf(lang, "animate_inanimate_remove")
    ):
        tags.remove("animate")
        tags.remove("inanimate")
    if (
        "virile" in tags
        and "nonvirile" in tags
        and get_lang_conf(lang, "virile_nonvirile_remove")
    ):
        tags.remove("virile")
        tags.remove("nonvirile")
    # For each whole category the language defines (numbers, genders, voices,
    # strengths, persons, definitenesses): if *all* of its values are present,
    # the tags convey no information, so drop the entire category.  This
    # replaces six identical hand-written stanzas with one loop.
    for conf_key in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        values = get_lang_conf(lang, conf_key)
        if values and all(x in tags for x in values):
            for x in values:
                tags.remove(x)
def tagset_cats(tagset: TagSets) -> set[str]:
    """Return the set of tag categories appearing in ``tagset``, merged
    across all of its alternatives."""
    return {valid_tags[tag] for alt in tagset for tag in alt}
def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # Fixed copy-paste bug: this assert previously re-checked tagsets1.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Add one alternative to ``tagsets``, merging it with an existing
        # alternative when they differ in at most one tag category.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            if num_differ <= 1:
                # Yes, they can be merged.  Remove the old alternative and
                # re-add the merged one (which may merge further).
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        # Both inputs were empty/degenerate; return one empty alternative.
        tagsets.append(())
    return tagsets
def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # Fixed copy-paste bug: this assert previously re-checked tagsets1.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    # Cartesian product: every alternative from tagsets1 combined with every
    # alternative from tagsets2, deduplicated.
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            if "dummy-ignored-text-cell" in tags:
                # Internal marker tag; never emitted in results.
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    return new_tagsets
@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags), where ``cleaned`` is the cell text with
    reference markers stripped, ``refs`` is the list of extracted reference
    symbols (NFKD-normalized), ``defs`` is a list of (symbol, definition)
    pairs when the cell itself defines reference symbols, and ``tags`` is
    extra tags implied by special markers."""
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags = []
    # Strip trailing comma/bullet and collapse whitespace.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Free-text explanatory cells are ignored wholesale; the marker tag is
    # recognized downstream and the cell skipped.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers: trailing "^x" or "^(x,y)" markers,
    # consumed repeatedly from the end of the cell text.
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                # Language-specific marker mapping to one or more tags.
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # The cell defines reference symbols: collect (symbol, text) pairs.
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            # Groups 3/5/6 of def_re hold the alternative symbol captures.
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                # Superscript "rare" marker.
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition ("1) text", "2 text", "3: text")
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    #       .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags
@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is dictionary describing additional forms to be
    included in the part-of-speech entry)."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Normalize: decode HTML entities, drop inline HTML tags, collapse spaces.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags: list[str] = []
    table_tags: list[str] = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip matches that are part of an "Inflection of X" style heading.
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    # French reflexive verbs: "Conjugation of s'.../se ..." titles.
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                # "class ...", "type ...", "auxiliary ..." etc. prefixes.
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms
794def expand_header(
795 wxr: WiktextractContext,
796 tablecontext: "TableContext",
797 word: str,
798 lang: str,
799 pos: str,
800 text: str,
801 base_tags: Union[list[str], set[str], tuple[str, ...]],
802 silent=False,
803 ignore_tags=False,
804 depth=0,
805 column_number: int | None = None,
806) -> list[tuple[str, ...]]:
807 """Expands a cell header to tagset, handling conditional expressions
808 in infl_map. This returns list of tuples of tags, each list element
809 describing an alternative interpretation. ``base_tags`` is combined
810 column and row tags for the cell in which the text is being interpreted
811 (conditional expressions in inflection data may depend on it).
812 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags``
813 is True, then tags listed in "if" will be ignored in the test (this is
814 used when trying to heuristically detect whether a non-<th> cell is anyway
815 a header)."""
816 assert isinstance(wxr, WiktextractContext)
817 assert isinstance(word, str)
818 assert isinstance(lang, str)
819 assert isinstance(pos, str)
820 assert isinstance(text, str)
821 assert isinstance(base_tags, (list, tuple, set))
822 assert silent in (True, False)
823 assert isinstance(depth, int)
824 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags))
825 # First map the text using the inflection map
826 text = clean_value(wxr, text)
827 combined_return: list[tuple[str, ...]] = []
828 parts = split_at_comma_semi(text, separators=[";"])
829 for text in parts:
830 if not text: 830 ↛ 831line 830 didn't jump to line 831 because the condition on line 830 was never true
831 continue
832 if text in infl_map:
833 v = infl_map[text] # list or string
834 else:
835 m = re.match(infl_start_re, text)
836 if m is not None: 836 ↛ 837line 836 didn't jump to line 837 because the condition on line 836 was never true
837 v = infl_start_map[m.group(1)]
838 # print("INFL_START {} -> {}".format(text, v))
839 elif re.match(r"Notes", text):
840 # Ignored header
841 # print("IGNORING NOTES")
842 combined_return = or_tagsets(
843 lang, pos, combined_return, [("dummy-skip-this",)]
844 )
845 # this just adds dummy-skip-this
846 continue
847 elif text in IGNORED_COLVALUES:
848 combined_return = or_tagsets(
849 lang, pos, combined_return, [("dummy-ignore-skipped",)]
850 )
851 continue
852 # Try without final parenthesized part
853 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text)
854 if text_without_parens in infl_map:
855 v = infl_map[text_without_parens]
856 elif m is None: 856 ↛ 872line 856 didn't jump to line 872 because the condition on line 856 was always true
857 if not silent:
858 wxr.wtp.debug(
859 "inflection table: unrecognized header: {}".format(
860 repr(text)
861 ),
862 sortid="inflection/735",
863 )
864 # Unrecognized header
865 combined_return = or_tagsets(
866 lang, pos, combined_return, [("error-unrecognized-form",)]
867 )
868 continue
870 # Then loop interpreting the value, until the value is a simple string.
871 # This may evaluate nested conditional expressions.
872 default_else = None
873 while True:
874 # If it is a string, we are done.
875 if isinstance(v, str):
876 tags = set(v.split())
877 remove_useless_tags(lang, pos, tags)
878 tagset = [tuple(sorted(tags))]
879 break
880 # For a list, just interpret it as alternatives. (Currently the
881 # alternatives must directly be strings.)
882 if isinstance(v, (list, tuple)):
883 tagset = []
884 for x in v:
885 tags = set(x.split())
886 remove_useless_tags(lang, pos, tags)
887 tags_t = tuple(sorted(tags))
888 if tags_t not in tagset: 888 ↛ 884line 888 didn't jump to line 884 because the condition on line 888 was always true
889 tagset.append(tags_t)
890 break
891 # Otherwise the value should be a dictionary describing a
892 # conditional expression.
893 if not isinstance(v, dict): 893 ↛ 894line 893 didn't jump to line 894 because the condition on line 893 was never true
894 wxr.wtp.debug(
895 "inflection table: internal: "
896 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]),
897 sortid="inflection/767",
898 )
899 tagset = [()]
900 break
901 # Evaluate the conditional expression.
902 assert isinstance(v, dict)
903 cond: Union[bool, str] = "default-true"
904 c: Union[str, list[str], set[str]] = ""
905 # Handle "lang" condition. The value must be either a
906 # single language or a list of languages, and the
907 # condition evaluates to True if the table is one of
908 # those languages.
909 if "lang" in v:
910 c = v["lang"]
911 # check if it's a code and transform if necessary
912 if isinstance(c, str):
913 if c != lang:
914 cond = lang == code_to_name(c, "en")
915 else:
916 cond = True
917 else:
918 assert isinstance(c, (list, tuple, set))
919 if lang not in c:
920 cond = name_to_code(lang, "en") in c
921 else:
922 cond = True
923 # Handle "nested-table-depth" condition. The value must
924 # be an int or list of ints, and the condition evaluates
925 # True if the depth is one of those values.
926 # "depth" is how deep into a nested table tree the current
927 # table lies. It is first started in handle_wikitext_table,
928 # so only applies to tables-within-tables, not other
929 # WikiNode content. `depth` is currently only passed as a
930 # parameter down the table parsing stack, and not stored.
931 if cond and "nested-table-depth" in v: 931 ↛ 932line 931 didn't jump to line 932 because the condition on line 931 was never true
932 d = v["nested-table-depth"]
933 if isinstance(d, int):
934 cond = d == depth
935 else:
936 assert isinstance(d, (list, tuple, set))
937 cond = depth in d
938 # Column index: check if we're in position X of the row
939 if cond and "column-index" in v:
940 index = v["column-index"]
941 if isinstance(index, int): 941 ↛ 944line 941 didn't jump to line 944 because the condition on line 941 was always true
942 cond = index == column_number
943 else:
944 assert isinstance(index, (list, tuple, set))
945 cond = column_number in index
946 # Handle inflection-template condition. Must be a string
947 # or list of strings, and if tablecontext.template_name is in
948 # those, accept the condition.
949 # TableContext.template_name is passed down from page/
950 # parse_inflection, before parsing and expanding itself
951 # has begun.
952 if cond and tablecontext and "inflection-template" in v:
953 d1 = v["inflection-template"]
954 if isinstance(d1, str): 954 ↛ 957line 954 didn't jump to line 957 because the condition on line 954 was always true
955 cond = d1 == tablecontext.template_name
956 else:
957 assert isinstance(d1, (list, tuple, set))
958 cond = tablecontext.template_name in d1
959 # Handle "pos" condition. The value must be either a single
960 # part-of-speech or a list of them, and the condition evaluates to
961 # True if the part-of-speech is any of those listed.
962 if cond and "pos" in v:
963 c = v["pos"]
964 if isinstance(c, str):
965 cond = c == pos
966 else:
967 assert isinstance(c, (list, tuple, set))
968 cond = pos in c
969 # Handle "if" condition. The value must be a string containing a
970 # space-separated list of tags. The condition evaluates to True if
971 # ``base_tags`` contains all of the listed tags. If the condition
972 # is of the form "any: ...tags...", then any of the tags will be
973 # enough.
974 if cond and "if" in v and not ignore_tags:
975 c = v["if"]
976 assert isinstance(c, str)
977 # "if" condition is true if any of the listed tags is present if
978 # it starts with "any:", otherwise all must be present
979 if c.startswith("any: "):
980 cond = any(t in base_tags for t in c[5:].split())
981 else:
982 cond = all(t in base_tags for t in c.split())
984 # Handle "default" assignment. Store the value to be used
985 # as a default later.
986 if "default" in v:
987 assert isinstance(v["default"], str)
988 default_else = v["default"]
990 # Warning message about missing conditions for debugging.
992 if cond == "default-true" and not default_else and not silent:
993 wxr.wtp.debug(
994 "inflection table: IF MISSING COND: word={} "
995 "lang={} text={} base_tags={} c={} cond={}".format(
996 word, lang, text, base_tags, c, cond
997 ),
998 sortid="inflection/851",
999 )
1000 # Based on the result of evaluating the condition, select either
1001 # "then" part or "else" part.
1002 if cond:
1003 v = v.get("then", "")
1004 else:
1005 v1 = v.get("else")
1006 if v1 is None:
1007 if default_else is not None:
1008 v = default_else
1009 else:
1010 if not silent:
1011 wxr.wtp.debug(
1012 "inflection table: IF WITHOUT ELSE EVALS "
1013 "False: "
1014 "{}/{} {!r} base_tags={}".format(
1015 word, lang, text, base_tags
1016 ),
1017 sortid="inflection/865",
1018 )
1019 v = "error-unrecognized-form"
1020 else:
1021 v = v1
1023 # Merge the resulting tagset from this header part with the other
1024 # tagsets from the whole header
1025 combined_return = or_tagsets(lang, pos, combined_return, tagset)
1027 # Return the combined tagsets, or empty tagset if we got no tagsets
1028 if not combined_return:
1029 combined_return = [()]
1030 return combined_return
def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list[HdrSpan],
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Walks ``hdrspans`` in reverse (headers closest above the cell first),
    keeping only spans that horizontally overlap the cell at columns
    [start, start+colspan).  Tagsets from headers on the same row are
    merged with or_tagsets/and_tagsets; moving to a higher row ANDs the
    accumulated row tagsets into the column result.  A cascade of
    category-based heuristics (partly language-specific via
    get_lang_conf) decides when to stop taking headers from further up
    or to skip a header entirely.

    Returns a list of tag tuples (alternative tagsets) for the cell;
    at minimum ``[()]``.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    #       .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                " row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    # (start, colspan) keys of header spans already consumed; used to
    # detect a second header occupying the same column position above.
    used = set()
    coltags: list[tuple[str, ...]] = [()]
    last_header_row = 1000000  # sentinel: no header row accepted yet
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets: list[tuple[str, ...]] = [()]
    row_tagsets_rownum = 1000000  # sentinel, forces first-row merge below
    used_hdrspans = set()  # id()s of HdrSpan objects already merged in
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            # Partial overlap at the right edge (unless the span was
            # artificially expanded to cover the row) also aborts.
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            # in_cats: tag categories present in same-row headers that lie
            # fully inside this cell's column range.
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets. This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row. Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside
                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            # A header at the same column position was already taken from a
            # lower row; the language config decides what to do now.
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above. However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 10000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            # NOTE: this starts a new if-chain; the preceding elif chain may
            # fall through to here without breaking.
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags
1415def parse_simple_table(
1416 wxr: WiktextractContext,
1417 tablecontext: "TableContext",
1418 word: str,
1419 lang: str,
1420 pos: str,
1421 rows: list[list[InflCell]],
1422 titles: list[str],
1423 source: str,
1424 after: str,
1425 depth: int,
1426) -> list[FormData]:
1427 """This is the default table parser. Despite its name, it can parse
1428 complex tables. This returns a list of forms to be added to the
1429 part-of-speech, or None if the table could not be parsed."""
1430 assert isinstance(wxr, WiktextractContext)
1431 assert isinstance(tablecontext, TableContext)
1432 assert isinstance(word, str)
1433 assert isinstance(lang, str)
1434 assert isinstance(pos, str)
1435 assert isinstance(rows, list)
1436 assert isinstance(source, str)
1437 assert isinstance(after, str)
1438 assert isinstance(depth, int)
1439 for row in rows:
1440 for cell in row:
1441 assert isinstance(cell, InflCell)
1442 assert isinstance(titles, list)
1443 for x in titles:
1444 assert isinstance(x, str)
1446 # print("PARSE_SIMPLE_TABLE: TITLES:", titles)
1447 if debug_cell_text: 1447 ↛ 1448line 1447 didn't jump to line 1448 because the condition on line 1447 was never true
1448 print("ROWS:")
1449 for row in rows:
1450 print(" ", row)
1452 # Check for forced rowspan kludge. See e.g.
1453 # maorski/Serbo-Croatian. These are essentially multi-row
1454 # cells implemented using <br> rather than separate cell. We fix this
1455 # by identifying rows where this happens, and splitting the current row
1456 # to multiple rows by synthesizing additional cells.
1457 new_rows = []
1458 for row in rows:
1459 split_row = (
1460 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row)
1461 and
1462 # x is an InflCell
1463 all(x.rowspan == 1 for x in row)
1464 )
1465 if not split_row:
1466 new_rows.append(row)
1467 continue
1468 row1 = []
1469 row2 = []
1470 for cell in row:
1471 cell1 = copy.deepcopy(cell)
1472 if "\n" in cell.text:
1473 # Has more than one line - split this cell
1474 parts = cell.text.strip().splitlines()
1475 if len(parts) != 2: 1475 ↛ 1476line 1475 didn't jump to line 1476 because the condition on line 1475 was never true
1476 wxr.wtp.debug(
1477 "forced rowspan kludge got {} parts: {!r}".format(
1478 len(parts), cell.text
1479 ),
1480 sortid="inflection/1234",
1481 )
1482 cell2 = copy.deepcopy(cell)
1483 cell1.text = parts[0]
1484 cell2.text = parts[1]
1485 else:
1486 cell1.rowspan = 2
1487 cell2 = cell1 # ref, not a copy
1488 row1.append(cell1)
1489 row2.append(cell2)
1490 new_rows.append(row1)
1491 new_rows.append(row2)
1492 rows = new_rows
1493 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:")
1494 # for row in rows:
1495 # print(" ", row)
1497 # Parse definitions for references (from table itself and from text
1498 # after it)
1499 def_ht = {}
1501 def add_defs(defs: list[tuple[str, str]]) -> None:
1502 for ref, d in defs:
1503 # print("DEF: ref={} d={}".format(ref, d))
1504 d = d.strip()
1505 d = d.split(". ")[0].strip() # text before ". "
1506 if not d: 1506 ↛ 1507line 1506 didn't jump to line 1507 because the condition on line 1506 was never true
1507 continue
1508 if d.endswith("."): # catc ".."??
1509 d = d[:-1]
1510 tags, topics = decode_tags(d, no_unknown_starts=True)
1511 # print(f"{ref=}, {transformed=}, {tags=}")
1512 if topics or any("error-unknown-tag" in ts for ts in tags):
1513 d = d[0].lower() + d[1:]
1514 tags, topics = decode_tags(d, no_unknown_starts=True)
1515 if topics or any("error-unknown-tag" in ts for ts in tags):
1516 # Failed to parse as tags
1517 # print("Failed: topics={} tags={}"
1518 # .format(topics, tags))
1519 continue
1520 tags1_s: set[str] = set()
1521 for ts in tags:
1522 # Set.update is a union operation: definition tags are flat
1523 tags1_s.update(ts)
1524 tags1 = tuple(sorted(tags1_s))
1525 # print("DEFINED: {} -> {}".format(ref, tags1))
1526 def_ht[ref] = tags1
    def generate_tags(
        rowtags: list[tuple[str, ...]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        # Expand the current header cell text into tagsets and combine it
        # with the existing row tags and the column tags computed from the
        # header spans.  Returns (new_rowtags, new_coltags, all_hdr_tags).
        # Uses from the enclosing scope: hdrspans, col_idx, colspan, col,
        # text, global_tags, refs_tags, hdr_tags and depth.
        new_coltags: list[tuple[str, ...]] = []
        all_hdr_tags: list[tuple[str, ...]] = []  # list of tuples
        new_rowtags: list[tuple[str, ...]] = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                base_tags: set[str] = (
                    set(rt0) | set(ct0) | set(global_tags) | set(table_tags)
                )  # Union.
                # print(f"{rt0=}, {ct0=}, {global_tags=},"
                #       f" {table_tags=}, {base_tags=}")
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                    column_number=col_idx,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags
    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        # Create an HdrSpan for the current header cell, append it to
        # hdrspans, and decide what happens to the pending left-side
        # header span (col0_hdrspan): keep it as-is, expand it to cover
        # columns up to the current one, or start a new one.  Returns the
        # (possibly unchanged) col, the updated col0_followed_by_nonempty
        # flag, and the new col0_hdrspan.  Uses from the enclosing scope:
        # col_idx, colspan, rowspan, rownum, new_coltags, all_headers,
        # all_hdr_tags and previously_seen.
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags). In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            # No pending left-side header yet; this cell becomes it.
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # print("col0_cats={} later_cats={} "
            #       "fol_by_nonempty={} col_idx={} end={} "
            #       "tagsets={}"
            #       .format(col0_cats, later_cats,
            #               col0_followed_by_nonempty, col_idx,
            #               col0_hdrspan.start +
            #               col0_hdrspan.colspan,
            #               col0_hdrspan.tagsets))
            # print("col0.rowspan={} rowspan={}"
            #       .format(col0_hdrspan.rowspan, rowspan))
            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            #   - col0_hdrspan set, continue with allowed current
            #   - col0_hdrspan set, expand, start new
            #   - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan. Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan
                # *Expand* current col0_hdrspan
                # print("EXPANDING COL0 MID: {} from {} to {} "
                #       "cols {}"
                #       .format(col0_hdrspan.text,
                #               col0_hdrspan.colspan,
                #               col_idx - col0_hdrspan.start,
                #               col0_hdrspan.tagsets))
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan
1683 def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
1684 # Split the cell text into alternatives
1685 split_extra_tags = []
1686 if col and is_superscript(col[0]): 1686 ↛ 1687line 1686 didn't jump to line 1687 because the condition on line 1686 was never true
1687 alts = [col]
1688 else:
1689 separators = [";", "•", r"\n", " or "]
1690 if " + " not in col:
1691 separators.append(",")
1692 if not col.endswith("/"):
1693 separators.append("/")
1694 if col in special_phrase_splits:
1695 # Use language-specific special splits.
1696 # These are phrases and constructions that have
1697 # unique ways of splitting, not specific characters
1698 # to split on like with the default splitting.
1699 alts, tags = special_phrase_splits[col]
1700 split_extra_tags = tags.split()
1701 for x in split_extra_tags:
1702 assert x in valid_tags
1703 assert isinstance(alts, (list, tuple))
1704 assert isinstance(tags, str)
1705 else:
1706 # Use default splitting. However, recognize
1707 # language-specific replacements and change them to magic
1708 # characters before splitting. This way we won't split
1709 # them. This is important for, e.g., recognizing
1710 # alternative pronouns.
1711 # The magic characters are characters out of Unicode scope
1712 # that are given a simple incremental value, int > unicode.
1713 repls = {}
1714 magic_ch = MAGIC_FIRST
1715 trs = get_lang_conf(lang, "form_transformations")
1716 # trs is a list of lists of strings
1717 for _, v, _, _ in trs:
1718 # v is a pattern string, like "^ich"
1719 # form_transformations data is doing double-duty here,
1720 # because the pattern strings are already known to us and
1721 # not meant to be split.
1722 m = re.search(v, col)
1723 if m is not None:
1724 # if pattern found in text
1725 magic = chr(magic_ch)
1726 magic_ch += 1 # next magic character value
1727 col = re.sub(v, magic, col) # replace with magic ch
1728 repls[magic] = m.group(0)
1729 # remember what regex match string each magic char
1730 # replaces. .group(0) is the whole match.
1731 alts0 = split_at_comma_semi(col, separators=separators)
1732 # with magic characters in place, split the text so that
1733 # pre-transformation text is out of the way.
1734 alts = []
1735 for alt in alts0:
1736 # create a new list with the separated items and
1737 # the magic characters replaced with the original texts.
1738 for k, v in repls.items():
1739 alt = re.sub(k, v, alt)
1740 alts.append(alt)
1742 # Remove "*" from beginning of forms, as in non-attested
1743 # or reconstructed forms. Otherwise it might confuse romanization
1744 # detection.
1745 alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts)
1746 alts = list(
1747 x for x in alts if not re.match(r"pronounced with |\(with ", x)
1748 )
1749 alts = list(
1750 re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts
1751 )
1752 return col, alts, split_extra_tags
def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
    """Split cell alternatives into (form, romanization, ipa) triples.

    Handle the special case where romanization is given under the
    normal form, e.g. in Russian.  There can be multiple
    comma-separated forms in each case.  We also handle the case
    where instead of romanization we have IPA pronunciation
    (e.g., avoir/French/verb).

    Reads ``tablecontext`` from the enclosing scope for the Georgian
    "ka-decl-noun" special case.  ``classify_desc`` is used to tell
    scripts apart; here a classification of "other" is treated as
    native-script text (not romanization/English).
    """
    len2 = len(alts) // 2
    # Check for IPAs (forms first, IPAs under)
    # base, base, IPA, IPA
    if (
        len(alts) % 2 == 0  # Divisible by two
        and all(
            re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
            for x in alts[len2:]
        )
    ):  # In the second half of alts
        nalts = list(
            (alts[i], "", alts[i + len2])
            # List of tuples: (base, "", ipa)
            for i in range(len2)
        )
    # base, base, base, IPA
    elif (
        len(alts) > 2
        and re.match(r"^\s*/.*/\s*$", alts[-1])
        and all(not x.startswith("/") for x in alts[:-1])
    ):
        # Only if the last alt is IPA
        nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
    # base, IPA, IPA, IPA
    elif (
        len(alts) > 2
        and not alts[0].startswith("/")
        and all(
            re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
        )
    ):
        # First is base and the rest is IPA alternatives
        nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))
    # Check for romanizations, forms first, romanizations under
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    # Remove ends of strings starting from ^.
                    # Superscripts have been already removed
                    # from the string, while ^xyz needs to be
                    # removed separately, though it's usually
                    # something with a single letter?
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            == "other"
            for x in alts[:len2]
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for x in alts[len2:]
        )
    ):
        nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
    # Check for romanizations, forms and romanizations alternating
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            == "other"
            for i in range(0, len(alts), 2)  # even indices: native forms
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for i in range(1, len(alts), 2)  # odd indices: romanizations
        )
    ):
        nalts = list(
            (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
        )
    # Handle complex Georgian entries with alternative forms and
    # romanizations. It's a bit of a mess. Remove this kludge if not
    # needed anymore. NOTE THAT THE PARENTHESES ON THE WEBSITE ARE NOT
    # DISPLAYED. They are put inside their own span elements that are
    # then hidden with some CSS.
    # https://en.wiktionary.org/wiki/%E1%83%90%E1%83%9B%E1%83%94%E1%83%A0%E1%83%98%E1%83%99%E1%83%98%E1%83%A1_%E1%83%A8%E1%83%94%E1%83%94%E1%83%A0%E1%83%97%E1%83%94%E1%83%91%E1%83%A3%E1%83%9A%E1%83%98_%E1%83%A8%E1%83%A2%E1%83%90%E1%83%A2%E1%83%94%E1%83%91%E1%83%98
    # ამერიკის შეერთებულ შტატებს(ა) (ameriḳis šeertebul šṭaṭebs(a))
    # The above should generate two alts entries, with two different
    # parallel versions, one without (a) and with (a) at the end,
    # for both the Georgian original and the romanization.
    elif (
        tablecontext.template_name == "ka-decl-noun"
        and len(alts) >= 1
        and any(" (" in alt_ for alt_ in alts)
    ):
        nalts = ka_decl_noun_template_cell(alts)
    else:
        # Default case: expand optional parenthesized fragments such
        # as "kind(er)" or "lampai(tten/den)" into full alternatives.
        new_alts = []
        for alt in alts:
            lst = [""]
            idx = 0
            for m in re.finditer(
                r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                # start OR letter OR asterisk (word/word*)
                # \\___________group 1_______/ \ \_g3_///
                #               \             \__gr. 2_//
                # \_____________group 0________________/
                alt,
            ):
                v = m.group(2)  # (word/word/word...)
                if (
                    classify_desc(v) == "tags"  # Tags inside parens
                    or m.group(0) == alt
                ):  # All in parens
                    continue
                new_lst = []
                for x in lst:
                    x += alt[idx : m.start()] + m.group(1)
                    # alt until letter or asterisk
                    idx = m.end()
                    vparts = v.split("/")
                    # group(2) = ["word", "wörd"...]
                    if len(vparts) == 1:
                        new_lst.append(x)
                        new_lst.append(x + v)
                        # "kind(er)" -> ["kind", "kinder"]
                    else:
                        for vv in vparts:
                            new_lst.append(x + vv)
                            # "lampai(tten/den)" ->
                            # ["lampaitten", "lampaiden"]
                lst = new_lst
            for x in lst:
                new_alts.append(x + alt[idx:])
                # add the end of alt
        nalts = list((x, "", "") for x in new_alts)
        # [form, no romanization, no ipa]
    return nalts
def find_semantic_parens(form: str) -> tuple[str, list[str]]:
    """Strip semantic brackets from *form* and translate them to tags.

    "Some languages" (=Greek) use brackets to mark things that
    require tags, like (informality), [rarity] and {archaicity}.
    The wrapper characters are always removed when a whole-form
    bracket pattern matches; the corresponding tags are added only
    when the language configuration enables that bracket convention
    (cf. είμαι/Greek/Verb).  Returns (unwrapped form, extra tags).
    """
    # Each rule: (whole-form regex, chars stripped per side,
    #             required lang-config keys, tags added when enabled).
    bracket_rules = (
        (
            r"\([^][(){}]*\)$",
            1,
            ("parentheses_for_informal",),
            ["informal"],
        ),
        (
            r"\{\[[^][(){}]*\]\}$",
            2,
            ("square_brackets_for_rare", "curly_brackets_for_archaic"),
            ["rare", "archaic"],
        ),
        (
            r"\{[^][(){}]*\}$",
            1,
            ("curly_brackets_for_archaic",),
            ["archaic"],
        ),
        (
            r"\[[^][(){}]*\]$",
            1,
            ("square_brackets_for_rare",),
            ["rare"],
        ),
    )
    extra_tags: list[str] = []
    for pattern, strip_n, conf_keys, rule_tags in bracket_rules:
        if re.match(pattern, form):
            # Unwrap unconditionally; tag only if the language uses
            # this bracket convention.
            form = form[strip_n:-strip_n]
            if all(get_lang_conf(lang, key) for key in conf_keys):
                extra_tags.extend(rule_tags)
            break
    return form, extra_tags
def handle_parens(
    form: str, roman: str, clitic: str | None, extra_tags: list[str]
) -> tuple[str, str, str | None]:
    """Interpret parenthesized material found inside a form cell.

    The matched parenthesis contents (``paren``), the regex match
    object (``m``) and the replacement string (``subst``) are read
    from the enclosing scope, where they are set just before this is
    called (the TYPE_CHECKING asserts below document this).  The
    parenthesized part may be a clitic, inflection tags (appended to
    *extra_tags* in place), or a romanization; in each handled case
    it is cut out of *form*.  Returns (form, roman, clitic), possibly
    modified.
    """
    if TYPE_CHECKING:
        assert isinstance(paren, str)
        assert isinstance(m, re.Match)
    if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
        # is there a clitic starting with apostrophe?
        clitic = paren
        # assume the whole paren is a clitic
        # then remove paren from form
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif classify_desc(paren) == "tags":
        tagsets1, topics1 = decode_tags(paren)
        if not topics1:
            for ts in tagsets1:
                ts = tuple(x for x in ts if " " not in x)
                # There are some generated tags containing
                # spaces; do not let them through here.
                extra_tags.extend(ts)
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
    # brackets contain romanization
    elif (
        m.start() > 0
        and not roman
        and classify_desc(form[: m.start()]) == "other"
        and
        # "other" ~ text
        classify_desc(paren) in ("romanization", "english")
        and not re.search(r"^with |-form$", paren)
    ):
        roman = paren
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif re.search(r"^with |-form", paren):
        # "with ..." / "...-form" annotations: just drop the parens
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    return form, roman, clitic
def merge_row_and_column_tags(
    form: str,
    some_has_covered_text: bool,
    links: list[tuple[str, str]] | None = None,
) -> tuple[list[FormData], str, bool]:
    """Combine row-header and column-header tags into form entries.

    For each (rowtag-set, coltag-set) combination, build a merged tag
    set from closure state (``global_tags``, ``extra_tags``,
    ``refs_tags``, ``tablecontext.section_header``, ``rowtags``,
    ``coltags``), apply a series of language- and POS-specific tag
    cleanups, and emit one FormData per surviving combination (plus a
    separate "clitic" entry when the closure's ``clitic`` is set).
    Returns (entries, possibly-adjusted form, updated
    some_has_covered_text flag).

    NOTE(review): the *links* parameter is never used in this body;
    the code reads ``cell_links`` from the enclosing scope instead —
    confirm whether the parameter is vestigial.
    """
    # Merge column tags and row tags. We give preference
    # to moods etc coming from rowtags (cf. aussteigen/German/Verb
    # imperative forms).

    # In certain cases, what a tag means depends on whether
    # it is a row or column header. Depending on the language,
    # we replace certain tags with others if they're in
    # a column or row

    ret: list[FormData] = []
    # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
    # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
    for rt in sorted(rowtags):
        if "dummy-use-as-coltags" in rt:
            continue
        # if lang was in rowtag_replacements)
        # if not rtagreplacs == None:
        #     rt = replace_directional_tags(rt, rtagreplacs)
        for ct in sorted(coltags):
            if "dummy-use-as-rowtags" in ct:
                continue
            # if lang was in coltag_replacements
            # if not ctagreplacs == None:
            #     ct = replace_directional_tags(ct, ctagreplacs)
            tags = set(global_tags)
            tags.update(extra_tags)
            tags.update(rt)
            tags.update(refs_tags)
            tags.update(tablecontext.section_header)
            # Merge tags from column. For certain kinds of tags,
            # those coming from row take precedence.
            old_tags = set(tags)
            for t in ct:
                c = valid_tags[t]
                if c in ("mood", "case", "number") and any(
                    valid_tags[tt] == c for tt in old_tags
                ):
                    continue
                tags.add(t)

            # Extract language-specific tags from the
            # form. This may also adjust the form.
            form, lang_tags = lang_specific_tags(lang, pos, form)
            tags.update(lang_tags)

            # For non-finite verb forms, see if they have
            # a gender/class suffix
            if pos == "verb" and any(
                valid_tags[t] == "non-finite" for t in tags
            ):
                form, tt = parse_head_final_tags(wxr, lang, form)
                tags.update(tt)

            # Remove "personal" tag if have nth person; these
            # come up with e.g. reconhecer/Portuguese/Verb. But
            # not if we also have "pronoun"
            if (
                "personal" in tags
                and "pronoun" not in tags
                and any(
                    x in tags
                    for x in [
                        "first-person",
                        "second-person",
                        "third-person",
                    ]
                )
            ):
                tags.remove("personal")

            # If we have impersonal, remove person and number.
            # This happens with e.g. viajar/Portuguese/Verb
            if "impersonal" in tags:
                tags = tags - set(
                    [
                        "first-person",
                        "second-person",
                        "third-person",
                        "singular",
                        "plural",
                    ]
                )

            # Remove unnecessary "positive" tag from verb forms
            if pos == "verb" and "positive" in tags:
                if "negative" in tags:
                    tags.remove("negative")
                tags.remove("positive")

            # Many Russian (and other Slavic) inflection tables
            # have animate/inanimate distinction that generates
            # separate entries for neuter/feminine, but the
            # distinction only applies to masculine. Remove them
            # from neuter/feminine and eliminate duplicates.
            if get_lang_conf(lang, "masc_only_animate"):
                for t1 in ("animate", "inanimate"):
                    for t2 in ("neuter", "feminine"):
                        if (
                            t1 in tags
                            and t2 in tags
                            and "masculine" not in tags
                            and "plural" not in tags
                        ):
                            tags.remove(t1)

            # German adjective tables contain "(keiner)" etc
            # for mixed declension plural. When the adjective
            # disappears and it becomes just one word, remove
            # the "includes-article" tag. e.g. eiskalt/German
            if "includes-article" in tags and " " not in form:
                tags.remove("includes-article")

            # Handle ignored forms. We mark that the form was
            # provided. This is important information; some words
            # just do not have a certain form. However, there also
            # many cases where no word in a language has a
            # particular form. Post-processing could detect and
            # remove such cases.
            if form in IGNORED_COLVALUES:
                # if cell text seems to be ignorable
                if "dummy-ignore-skipped" in tags:
                    continue
                if (
                    col_idx not in has_covering_hdr
                    and some_has_covered_text
                ):
                    continue
                # don't ignore this cell if there's been a header
                # above it
                form = "-"
            elif col_idx in has_covering_hdr:
                some_has_covered_text = True

            # Handle ambiguous object concord. If a header
            # gives the "dummy-object-concord"-tag to a word,
            # replace person, number and gender tags with
            # their "object-" counterparts so that the verb
            # agrees with the object instead.
            # Use only when the verb has ONLY object agreement!
            # a پخول/Pashto
            if "dummy-object-concord" in tags:
                for subtag, objtag in object_concord_replacements.items():
                    if subtag in tags:
                        tags.remove(subtag)
                        tags.add(objtag)

            # Remove the dummy mood tag that we sometimes
            # use to block adding other mood and related
            # tags
            tags = tags - set(
                [
                    "dummy-mood",
                    "dummy-tense",
                    "dummy-ignore-skipped",
                    "dummy-object-concord",
                    "dummy-reset-headers",
                    "dummy-use-as-coltags",
                    "dummy-use-as-rowtags",
                    "dummy-store-hdrspan",
                    "dummy-load-stored-hdrspans",
                    "dummy-reset-stored-hdrspans",
                    "dummy-section-header",
                ]
            )

            # Perform language-specific tag replacements according
            # to rules in a table.
            lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
            if lang_tag_mappings is not None:
                for pre, post in lang_tag_mappings.items():
                    if all(t in tags for t in pre):
                        tags = (tags - set(pre)) | set(post)

            # Warn if there are entries with empty tags
            if not tags:
                wxr.wtp.debug(
                    "inflection table: empty tags for {}".format(form),
                    sortid="inflection/1826",
                )

            # Warn if form looks like IPA
            ########## XXX ########
            # Because IPA is its own unicode block, we could also
            # technically do a Unicode name check to see if a string
            # contains IPA. Not all valid IPA characters are in the
            # IPA extension block, so you can technically have false
            # negatives if it's something like /toki/, but it
            # shouldn't give false positives.
            # Alternatively, you could make a list of IPA-admissible
            # characters and reject non-IPA stuff with that.
            if re.match(r"\s*/.*/\s*$", form):
                wxr.wtp.debug(
                    "inflection table form looks like IPA: "
                    "form={} tags={}".format(form, tags),
                    sortid="inflection/1840",
                )

            # Note that this checks `form`, not `in tags`
            if form == "dummy-ignored-text-cell":
                continue

            if "dummy-remove-this-cell" in tags:
                continue

            # Add the form
            tags_list = list(sorted(tags))
            dt: FormData = {
                "form": form,
                "tags": tags_list,
                "source": source,
            }
            if roman:
                dt["roman"] = roman
            if ipa:
                dt["ipa"] = ipa
            if cell_links is not None and (
                matched_links := match_links_to_form(
                    wxr, form, cell_links, None
                )
            ):
                dt["links"] = matched_links
            ret.append(dt)
            # If we got separate clitic form, add it
            if clitic:
                dt = {
                    "form": clitic,
                    "tags": tags_list + ["clitic"],
                    "source": source,
                }
                ret.append(dt)
    return ret, form, some_has_covered_text
2228 # First extract definitions from cells
2229 # See defs_ht for footnote defs stuff
2230 for row in rows:
2231 for cell in row:
2232 text, refs, defs, hdr_tags = extract_cell_content(
2233 lang, word, cell.text
2234 )
2235 # refs, defs = footnote stuff, defs -> (ref, def)
2236 add_defs(defs)
2237 # Extract definitions from text after table
2238 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after)
2239 add_defs(defs)
2241 # Then extract the actual forms
2242 ret = []
2243 hdrspans: list[HdrSpan] = []
2244 first_col_has_text = False
2245 rownum = 0
2246 title = None
2247 global_tags = []
2248 table_tags = []
2249 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits")
2250 form_replacements = get_lang_conf(lang, "form_replacements")
2251 form_transformations = get_lang_conf(lang, "form_transformations")
2252 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells")
2253 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups")
2255 for title in titles:
2256 more_global_tags, more_table_tags, extra_forms = parse_title(
2257 title, source
2258 )
2259 global_tags.extend(more_global_tags)
2260 table_tags.extend(more_table_tags)
2261 ret.extend(extra_forms)
2262 cell_rowcnt: collections.defaultdict[int, int] = collections.defaultdict(
2263 int
2264 )
2265 seen_cells = set()
2266 has_covering_hdr = set()
2267 some_has_covered_text = False
2268 for row in rows:
2269 # print("ROW:", row)
2270 # print("====")
2271 # print(f"Start of PREVIOUS row hdrspans:"
2272 # f"{tuple(sp.tagsets for sp in hdrspans)}")
2273 # print(f"Start of row txt: {tuple(t.text for t in row)}")
2274 if not row: 2274 ↛ 2275line 2274 didn't jump to line 2275 because the condition on line 2274 was never true
2275 continue # Skip empty rows
2276 all_headers = all(x.is_title or not x.text.strip() for x in row)
2277 text = row[0].text
2278 if (
2279 row[0].is_title
2280 and text
2281 and not is_superscript(text[0])
2282 and text not in infl_map # zealous inflation map?
2283 and (
2284 re.match(r"Inflection ", text)
2285 or re.sub(
2286 r"\s+",
2287 " ", # flatten whitespace
2288 re.sub(
2289 r"\s*\([^)]*\)",
2290 "",
2291 # Remove whitespace+parens
2292 text,
2293 ),
2294 ).strip()
2295 not in infl_map
2296 )
2297 and not re.match(infl_start_re, text)
2298 and all(
2299 x.is_title == row[0].is_title and x.text == text
2300 # all InflCells in `row` have the same is_title and text
2301 for x in row
2302 )
2303 ):
2304 if text and title is None:
2305 # Only if there were no titles previously make the first
2306 # text that is found the title
2307 title = text
2308 if re.match(r"(Note:|Notes:)", title): 2308 ↛ 2309line 2308 didn't jump to line 2309 because the condition on line 2308 was never true
2309 continue # not a title
2310 more_global_tags, more_table_tags, extra_forms = parse_title(
2311 title, source
2312 )
2313 global_tags.extend(more_global_tags)
2314 table_tags.extend(more_table_tags)
2315 ret.extend(extra_forms)
2316 continue # Skip title rows without incrementing i
2317 if "dummy-skip-this" in global_tags: 2317 ↛ 2318line 2317 didn't jump to line 2318 because the condition on line 2317 was never true
2318 return []
2319 rowtags: list[tuple[str, ...]] = [()]
2320 # have_hdr = False
2321 # have_hdr never used?
2322 have_text = False
2323 samecell_cnt = 0
2324 col0_hdrspan = None # col0 or later header (despite its name)
2325 col0_followed_by_nonempty = False
2326 row_empty = True
2327 for col_idx, cell in enumerate(row):
2328 colspan = cell.colspan # >= 1
2329 rowspan = cell.rowspan # >= 1
2330 cell_links = cell.links # for weird links
2331 previously_seen = id(cell) in seen_cells
2332 # checks to see if this cell was in the previous ROW
2333 seen_cells.add(id(cell))
2334 if samecell_cnt == 0:
2335 # First column of a (possible multi-column) cell
2336 samecell_cnt = colspan - 1
2337 else:
2338 assert samecell_cnt > 0
2339 samecell_cnt -= 1
2340 continue
2342 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0
2343 # never used?
2345 # defaultdict(int) around line 1900
2346 cell_rowcnt[id(cell)] += 1
2347 # => how many cols this spans
2348 col: str = cell.text
2349 if not col:
2350 continue
2351 row_empty = False
2352 is_title = cell.is_title
2354 # If the cell has a target, i.e., text after colon, interpret
2355 # it as simply specifying a value for that value and ignore
2356 # it otherwise.
2357 if cell.target:
2358 text, refs, defs, hdr_tags = extract_cell_content(
2359 lang, word, col
2360 )
2361 if not text: 2361 ↛ 2362line 2361 didn't jump to line 2362 because the condition on line 2361 was never true
2362 continue
2363 refs_tags: set[str] = set()
2364 for ref in refs: # gets tags from footnotes 2364 ↛ 2365line 2364 didn't jump to line 2365 because the loop on line 2364 never started
2365 if ref in def_ht:
2366 refs_tags.update(def_ht[ref])
2367 rowtags = expand_header(
2368 wxr,
2369 tablecontext,
2370 word,
2371 lang,
2372 pos,
2373 text,
2374 [],
2375 silent=True,
2376 depth=depth,
2377 column_number=col_idx,
2378 )
2379 rowtags = list(
2380 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags)
2381 )
2382 is_title = False
2383 col = cell.target
2385 # print(rownum, col_idx, col)
2386 # print(f"is_title: {is_title}")
2387 if is_title:
2388 # It is a header cell
2389 text, refs, defs, hdr_tags = extract_cell_content(
2390 lang, word, col
2391 )
2392 if not text:
2393 continue
2394 # Extract tags from referenced footnotes
2395 refs_tags = set()
2396 for ref in refs:
2397 if ref in def_ht:
2398 refs_tags.update(def_ht[ref])
2400 # Expand header to tags
2401 v = expand_header(
2402 wxr,
2403 tablecontext,
2404 word,
2405 lang,
2406 pos,
2407 text,
2408 [],
2409 silent=True,
2410 depth=depth,
2411 column_number=col_idx,
2412 )
2413 # print("EXPANDED {!r} to {}".format(text, v))
2415 if col_idx == 0:
2416 # first_col_has_text is used for a test to ignore
2417 # upper-left cells that are just text without
2418 # header info
2419 first_col_has_text = True
2420 # Check if the header expands to reset hdrspans
2421 if any("dummy-reset-headers" in tt for tt in v):
2422 new_hdrspans = []
2423 for hdrspan in hdrspans:
2424 # if there are HdrSpan objects (abstract headers with
2425 # row- and column-spans) that are to the left or at the
2426 # same row or below, KEEP those; things above and to
2427 # the right of the hdrspan with dummy-reset-headers
2428 # are discarded. Tags from the header together with
2429 # dummy-reset-headers are kept as normal.
2430 if (
2431 hdrspan.start + hdrspan.colspan < col_idx
2432 or hdrspan.rownum > rownum - cell.rowspan
2433 ):
2434 new_hdrspans.append(hdrspan)
2435 hdrspans = new_hdrspans
2437 for tt in v:
2438 if "dummy-section-header" in tt: 2438 ↛ 2439line 2438 didn't jump to line 2439 because the condition on line 2438 was never true
2439 tablecontext.section_header = tt
2440 break
2441 if "dummy-reset-section-header" in tt: 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true
2442 tablecontext.section_header = tuple()
2443 # Text between headers on a row causes earlier headers to
2444 # be reset
2445 if have_text:
2446 # print(" HAVE_TEXT BEFORE HDR:", col)
2447 # Reset rowtags if new title column after previous
2448 # text cells
2449 # +-----+-----+-----+-----+
2450 # |hdr-a|txt-a|hdr-B|txt-B|
2451 # +-----+-----+-----+-----+
2452 # ^reset rowtags=>
2453 # XXX beware of header "—": "" - must not clear on that if
2454 # it expands to no tags
2455 rowtags = [()]
2456 # have_hdr = True
2457 # have_hdr never used?
2458 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags))
2459 # Update rowtags and coltags
2460 has_covering_hdr.add(col_idx) # col_idx == current column
2461 # has_covering_hdr is a set that has the col_idx-ids of columns
2462 # that have previously had some kind of header. It is never
2463 # resetted inside the col_idx-loops OR the bigger rows-loop, so
2464 # applies to the whole table.
2466 new_coltags: list[tuple[str, ...]]
2467 all_hdr_tags: list[tuple[str, ...]]
2468 rowtags, new_coltags, all_hdr_tags = generate_tags(
2469 rowtags, table_tags
2470 )
2472 if any("dummy-skip-this" in ts for ts in rowtags):
2473 continue # Skip this cell
2475 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2475 ↛ 2476line 2475 didn't jump to line 2476 because the condition on line 2475 was never true
2476 hdrspans.extend(tablecontext.stored_hdrspans)
2478 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2478 ↛ 2479line 2478 didn't jump to line 2479 because the condition on line 2478 was never true
2479 tablecontext.stored_hdrspans = []
2481 if any("dummy-store-hdrspan" in ts for ts in v): 2481 ↛ 2483line 2481 didn't jump to line 2483 because the condition on line 2481 was never true
2482 # print(f"STORED: {col}")
2483 store_new_hdrspan = True
2484 else:
2485 store_new_hdrspan = False
2487 new_coltags = list(
2488 x
2489 for x in new_coltags
2490 if not any(t in noinherit_tags for t in x)
2491 )
2492 # print("new_coltags={} previously_seen={} all_hdr_tags={}"
2493 # .format(new_coltags, previously_seen, all_hdr_tags))
2494 if any(new_coltags):
2495 (
2496 col,
2497 col0_followed_by_nonempty,
2498 col0_hdrspan,
2499 ) = add_new_hdrspan(
2500 col,
2501 hdrspans,
2502 store_new_hdrspan,
2503 col0_followed_by_nonempty,
2504 col0_hdrspan,
2505 )
2507 continue
2509 # These values are ignored, at least for now
2510 if re.match(r"^(# |\(see )", col): 2510 ↛ 2511line 2510 didn't jump to line 2511 because the condition on line 2510 was never true
2511 continue
2513 if any("dummy-skip-this" in ts for ts in rowtags):
2514 continue # Skip this cell
2516 # If the word has no rowtags and is a multi-row cell, then
2517 # ignore this. This happens with empty separator rows
2518 # within a rowspan>1 cell. cf. wander/English/Conjugation.
2519 if rowtags == [()] and rowspan > 1:
2520 continue
2522 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle.
2523 if cleanup_rules:
2524 for regx, substitution in cleanup_rules.items():
2525 col = re.sub(regx, substitution, col)
2527 if ( 2527 ↛ 2532line 2527 didn't jump to line 2532 because the condition on line 2527 was never true
2528 col_idx == 0
2529 and not first_col_has_text
2530 and get_lang_conf(lang, "ignore_top_left_text_cell") is True
2531 ):
2532 continue # Skip text at top left, as in Icelandic, Faroese
2534 # if col0_hdrspan is not None:
2535 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}"
2536 # .format(col0_hdrspan.text, col))
2537 col0_followed_by_nonempty = True
2538 have_text = True
2540 # Determine column tags for the multi-column cell
2541 combined_coltags = compute_coltags(
2542 lang, pos, hdrspans, col_idx, colspan, col
2543 )
2544 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2544 ↛ 2545line 2544 didn't jump to line 2545 because the condition on line 2544 was never true
2545 continue
2547 # Split the text into separate forms. First simplify spaces except
2548 # newline.
2549 col = re.sub(r"[ \t\r]+", " ", col)
2550 # Split the cell text into alternatives
2552 col, alts, split_extra_tags = split_text_into_alts(col)
2554 # Some cells have mixed form content, like text and romanization,
2555 # or text and IPA. Handle these.
2556 altss = handle_mixed_lines(alts)
2558 altsss = list((x, combined_coltags, cell_links) for x in altss)
2560 # Generate forms from the alternatives
2561 # alts is a list of (tuple of forms, tuple of tags)
2562 coltags: list[tuple[str, ...]]
2563 base_roman: str
2564 ipa: str
2565 for (form, base_roman, ipa), coltags, cell_links in altsss:
2566 form = form.strip()
2567 extra_tags: list[str] = []
2568 extra_tags.extend(split_extra_tags)
2569 # Handle special splits again here, so that we can have custom
2570 # mappings from form to form and tags.
2571 if form in form_replacements:
2572 replacement, tags = form_replacements[form]
2573 for x in tags.split():
2574 assert x in valid_tags
2575 assert isinstance(replacement, str)
2576 assert isinstance(tags, str)
2577 form = replacement
2578 extra_tags.extend(tags.split())
2580 check_romanization_form_transformation = False
2581 # loop over regexes in form_transformation and replace text
2582 # in form using regex patterns
2583 # this does a bit of the same stuff the above does,
2584 # but with regexes and re.sub() instead
2585 subst: str
2586 for (
2587 form_transformations_pos,
2588 vv,
2589 subst,
2590 tags,
2591 ) in form_transformations:
2592 # v is a pattern string, like "^ich"
2593 if (
2594 isinstance(form_transformations_pos, str)
2595 and pos != form_transformations_pos
2596 ) or (
2597 (not isinstance(form_transformations_pos, str))
2598 and pos not in form_transformations_pos
2599 ):
2600 continue
2601 m: re.Match | None = re.search(vv, form)
2602 if m is not None:
2603 if base_roman: 2603 ↛ 2604line 2603 didn't jump to line 2604 because the condition on line 2603 was never true
2604 for _, rom_v, rom_sub, _ in form_transformations:
2605 rom_m = re.search(rom_v, base_roman)
2606 if rom_m is not None:
2607 base_roman = re.sub(
2608 rom_v, rom_sub, base_roman
2609 )
2610 break
2611 form = re.sub(vv, subst, form)
2612 for x in tags.split():
2613 assert x in valid_tags
2614 extra_tags.extend(tags.split())
2615 check_romanization_form_transformation = True
2616 break
2618 # Clean the value, extracting reference symbols
2619 form, refs, defs, hdr_tags = extract_cell_content(
2620 lang, word, form
2621 )
2622 # if refs:
2623 # print("REFS:", refs)
2624 extra_tags.extend(hdr_tags)
2625 # Extract tags from referenced footnotes
2626 refs_tags = set()
2627 for ref in refs:
2628 if ref in def_ht:
2629 refs_tags.update(def_ht[ref])
2631 if base_roman:
2632 if check_romanization_form_transformation: 2632 ↛ 2636line 2632 didn't jump to line 2636 because the condition on line 2632 was never true
2633 # because form_transformations are used to handle things
2634 # where the romanization has the "same" structure, we
2635 # need to handle that here too....
2636 for (
2637 _,
2638 vv,
2639 subst,
2640 _,
2641 ) in form_transformations:
2642 # v is a pattern string, like "^ich"
2643 m = re.search(vv, base_roman)
2644 if m is not None:
2645 base_roman = re.sub(vv, subst, base_roman)
2646 # XXX add tag stuff here if needed
2647 break
2649 base_roman, _, _, hdr_tags = extract_cell_content(
2650 lang, word, base_roman
2651 )
2652 extra_tags.extend(hdr_tags)
2654 # Do some additional cleanup on the cell.
2655 form = re.sub(r"^\s*,\s*", "", form)
2656 form = re.sub(r"\s*,\s*$", "", form)
2657 form = re.sub(r"\s*(,\s*)+", ", ", form)
2658 form = re.sub(r"(?i)^Main:", "", form)
2659 form = re.sub(r"\s+", " ", form)
2660 form = form.strip()
2662 # Look for parentheses that have semantic meaning
2663 form, et = find_semantic_parens(form)
2664 extra_tags.extend(et)
2666 # Handle parentheses in the table element. We parse
2667 # tags anywhere and romanizations anywhere but beginning.
2668 roman: str = base_roman
2669 paren: str | None = None
2670 clitic: str | None = None
2671 m = re.search(r"(\s+|^)\(([^)]*)\)", form)
2672 # start|spaces + (anything)
2673 if m is not None:
2674 subst = m.group(1)
2675 paren = m.group(2)
2676 else:
2677 m = re.search(r"\(([^)]*)\)(\s+|$)", form)
2678 # (anything) + spaces|end
2679 if m is not None: 2679 ↛ 2680line 2679 didn't jump to line 2680 because the condition on line 2679 was never true
2680 paren = m.group(1)
2681 subst = m.group(2)
2682 if paren is not None:
2683 form, roman, clitic = handle_parens(
2684 form, roman, clitic, extra_tags
2685 )
2687 # Ignore certain forms that are not really forms,
2688 # unless they're really, really close to the article title
2689 if form in ( 2689 ↛ 2694line 2689 didn't jump to line 2694 because the condition on line 2689 was never true
2690 "",
2691 "unchanged",
2692 "after an", # in sona/Irish/Adj/Mutation
2693 ):
2694 Lev = distw([form], word)
2695 if form and Lev < 0.1:
2696 wxr.wtp.debug(
2697 "accepted possible false positive '{}' with"
2698 "> 0.1 Levenshtein distance in {}/{}".format(
2699 form, word, lang
2700 ),
2701 sortid="inflection/2213",
2702 )
2703 elif form and Lev < 0.3:
2704 wxr.wtp.debug(
2705 "skipped possible match '{}' with > 0.3"
2706 "Levenshtein distance in {}/{}".format(
2707 form, word, lang
2708 ),
2709 sortid="inflection/2218",
2710 )
2711 continue
2712 else:
2713 continue
2714 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} "
2715 # "FORM={!r} ROMAN={!r}"
2716 # .format(rowtags, coltags, refs_tags,
2717 # form, roman))
2719 # Merge tags from row and column and do miscellaneous
2720 # tag-related handling.
2721 (
2722 merge_ret,
2723 form,
2724 some_has_covered_text,
2725 ) = merge_row_and_column_tags(
2726 form, some_has_covered_text, cell_links
2727 )
2728 ret.extend(merge_ret)
2730 # End of row.
2731 rownum += 1
2732 # For certain languages, if the row was empty, reset
2733 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb).
2734 if row_empty and get_lang_conf(lang, "empty_row_resets"):
2735 hdrspans = []
2736 # Check if we should expand col0_hdrspan.
2737 if col0_hdrspan is not None:
2738 col0_allowed = get_lang_conf(lang, "hdr_expand_first")
2739 col0_cats = tagset_cats(col0_hdrspan.tagsets)
2740 # Only expand if col0_cats and later_cats are allowed
2741 # and don't overlap and col0 has tags, and there have
2742 # been no disallowed cells in between.
2743 if (
2744 not col0_followed_by_nonempty
2745 and not (col0_cats - col0_allowed)
2746 and
2747 # len(col0_cats) == 1 and
2748 col_idx > col0_hdrspan.start + col0_hdrspan.colspan
2749 ):
2750 # If an earlier header is only followed by headers that yield
2751 # no tags, expand it to entire row
2752 # print("EXPANDING COL0: {} from {} to {} cols {}"
2753 # .format(col0_hdrspan.text, col0_hdrspan.colspan,
2754 # len(row) - col0_hdrspan.start,
2755 # col0_hdrspan.tagsets))
2756 col0_hdrspan.colspan = len(row) - col0_hdrspan.start
2757 col0_hdrspan.expanded = True
2758 # XXX handle refs and defs
2759 # for x in hdrspans:
2760 # print(" HDRSPAN {} {} {} {!r}"
2761 # .format(x.start, x.colspan, x.tagsets, x.text))
2763 # Post-process German nouns with articles in separate columns. We move the
2764 # definite/indefinite/usually-without-article markers into the noun and
2765 # remove the article entries.
2766 if get_lang_conf(lang, "articles_in_separate_columns") and any(
2767 "noun" in x["tags"] for x in ret
2768 ):
2769 new_ret = []
2770 saved_tags: set[str] = set()
2771 had_noun = False
2772 for dt in ret:
2773 tags = dt["tags"]
2774 # print(tags)
2775 if "noun" in tags:
2776 tags = list(
2777 sorted(set(t for t in tags if t != "noun") | saved_tags)
2778 )
2779 had_noun = True
2780 elif ( 2780 ↛ 2807line 2780 didn't jump to line 2807 because the condition on line 2780 was always true
2781 "indefinite" in tags
2782 or "definite" in tags
2783 or "usually-without-article" in tags
2784 or "without-article" in tags
2785 ):
2786 if had_noun:
2787 saved_tags = set(tags)
2788 else:
2789 saved_tags = saved_tags | set(tags) # E.g. Haus/German
2790 remove_useless_tags(lang, pos, saved_tags)
2791 saved_tags = saved_tags & set(
2792 [
2793 "masculine",
2794 "feminine",
2795 "neuter",
2796 "singular",
2797 "plural",
2798 "indefinite",
2799 "definite",
2800 "usually-without-article",
2801 "without-article",
2802 ]
2803 )
2804 had_noun = False
2805 continue # Skip the articles
2807 dt = dt.copy()
2808 dt["tags"] = tags
2809 new_ret.append(dt)
2810 ret = new_ret
2812 elif possibly_ignored_forms:
2813 # Some languages have tables with cells that are kind of separated
2814 # and difficult to handle, like eulersche Formel/German where
2815 # the definite and indefinite articles are just floating.
2816 # If a language has a dict of conditionally_ignored_cells,
2817 # and if the contents of a cell is found in one of the rules
2818 # there, ignore that cell if it
2819 # 1. Does not have the appropriate tag (like "definite" for "die")
2820 # and
2821 # 2. The title of the article is not one of the other co-words
2822 # (ie. it's an article for the definite articles in german etc.)
2823 # pass
2824 new_ret = []
2825 for cell_data in ret:
2826 tags = cell_data["tags"]
2827 text = cell_data["form"]
2828 skip_this = False
2829 for key_tag, ignored_forms in possibly_ignored_forms.items():
2830 if text not in ignored_forms: 2830 ↛ 2832line 2830 didn't jump to line 2832 because the condition on line 2830 was always true
2831 continue
2832 if word in ignored_forms:
2833 continue
2834 if key_tag not in tags:
2835 skip_this = True
2837 if skip_this: 2837 ↛ 2838line 2837 didn't jump to line 2838 because the condition on line 2837 was never true
2838 continue
2839 new_ret.append(cell_data)
2841 ret = new_ret
2843 # Post-process English inflection tables, addding "multiword-construction"
2844 # when the number of words has increased.
2845 if lang == "English" and pos == "verb":
2846 word_words = len(word.split())
2847 new_ret = []
2848 for dt in ret:
2849 form = dt.get("form", "")
2850 if len(form.split()) > word_words:
2851 dt = dt.copy()
2852 dt["tags"] = list(dt.get("tags", []))
2853 # This strange copy-assigning shuffle is preventative black
2854 # magic; do not touch lest you invoke deep bugs.
2855 data_append(dt, "tags", "multiword-construction")
2856 new_ret.append(dt)
2857 ret = new_ret
2859 # Always insert "table-tags" detail as the first entry in any inflection
2860 # table. This way we can reliably detect where a new table starts.
2861 # Table-tags applies until the next table-tags entry.
2862 if ret or table_tags:
2863 table_tags = sorted(set(table_tags))
2864 dt = {
2865 "form": " ".join(table_tags),
2866 "source": source,
2867 "tags": ["table-tags"],
2868 }
2869 if dt["form"] == "":
2870 dt["form"] = "no-table-tags"
2871 if tablecontext.template_name:
2872 tn: FormData = {
2873 "form": tablecontext.template_name,
2874 "source": source,
2875 "tags": ["inflection-template"],
2876 }
2877 ret = [dt] + [tn] + ret
2878 else:
2879 ret = [dt] + ret
2881 return ret
def handle_generic_table(
    wxr: WiktextractContext,
    tablecontext: "TableContext",
    data: WordData,
    word: str,
    lang: str,
    pos: str,
    rows: list[list[InflCell]],
    titles: list[str],
    source: str,
    after: str,
    depth: int,
) -> None:
    """Parse an already-flattened table (``rows`` of InflCell grids plus
    ``titles``) with parse_simple_table() and append the resulting form
    entries to ``data["forms"]``, eliminating duplicates on the way."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        assert isinstance(row, list)
        for x in row:
            assert isinstance(x, InflCell)
    assert isinstance(titles, list)
    for s in titles:
        assert isinstance(s, str)

    # Try to parse the table as a simple table
    ret = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if ret is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms but eliminate duplicates.
    have_forms = set()
    for dt in ret:
        fdt = freeze(dt)
        if fdt in have_forms:
            continue  # Don't add duplicates
        # Some Russian words have Declension and Pre-reform declension partially
        # duplicating same data. Don't add "dated" tags variant if already have
        # the same without "dated" from the modern declension table

        tags = dt.get("tags", [])
        # for/else: the ``else`` branch runs only if the loop completes
        # without ``break``, i.e. this entry is NOT a "dated" duplicate of
        # an already-seen entry that differs only by the "dated" tag.
        for dated_tag in ("dated",):
            if dated_tag in tags:
                dt2 = dt.copy()
                tags2 = list(x for x in tags if x != dated_tag)
                dt2["tags"] = tags2
                if tags2 and freeze(dt2) in have_forms:
                    break  # Already have without archaic
        else:
            # "table-tags" marker entries are deliberately not recorded in
            # have_forms, so each table's marker is kept even when the text
            # is identical to a previous table's marker.
            if "table-tags" not in tags:
                have_forms.add(fdt)
            data_append(data, "forms", dt)
def determine_header(
    wxr: WiktextractContext,
    tablecontext,
    lang: str,
    word: str,
    pos: str,
    table_kind: NodeKind,
    kind: NodeKind | str,
    style: str | None,
    row: list[InflCell],
    col: WikiNode,
    celltext: str,
    titletext: str,
    cols_headered: list[bool],
    target: str | None,
    cellstyle: str,
    # is_title,
    # hdr_expansion,
    # target,
    # celltext,
) -> tuple[bool, list[tuple[str, ...]], str | None, str]:
    """Decide whether a table cell is a header cell.

    Combines several heuristics: the cell's node kind (th vs td), whether
    its cleaned text expands to valid header tags via expand_header(),
    per-language allow-lists (LANGUAGES_WITH_CELLS_AS_HEADERS), CSS-style
    matching against the first column, and a few text patterns.  Returns
    (is_title, hdr_expansion, target, celltext); ``target`` and
    ``celltext`` may be rewritten when the cell has a "Header: target"
    structure."""
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # Header cells are TABLE_HEADER_CELL in wikitext tables and "th" in
    # raw HTML tables.
    header_kind: NodeKind | str
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    # "Header: target" structure: split off the target part and treat
    # the prefix as the header.
    if idx >= 0 and titletext[:idx] in infl_map:
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext
3150class TableContext:
3151 """Saved context used when parsing a table and its subtables."""
3153 __slot__ = (
3154 "stored_hdrspans",
3155 "section_header",
3156 "template_name",
3157 )
3159 def __init__(self, template_name: str | None = None) -> None:
3160 self.stored_hdrspans: list[HdrSpan] = []
3161 self.section_header: tuple[str, ...] = tuple()
3162 if template_name is None:
3163 self.template_name = ""
3164 else:
3165 self.template_name = template_name
def handle_wikitext_or_html_table(
    wxr: WiktextractContext,
    word: str,
    lang: str,
    pos: str,
    data: WordData,
    tree: WikiNode,
    titles: list[str],
    source: str,
    after: str,
    tablecontext: TableContext | None = None,
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms.  This adds the forms into ``data``.  Works for both wikitext
    tables (NodeKind.TABLE) and raw HTML <table> trees."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("-------==========-------")

    if not tablecontext:
        tablecontext = TableContext()

    # Get language specific text removal patterns
    remove_text_patterns: tuple[str | re.Pattern, ...] | None = None
    if rem := get_lang_conf(lang, "remove_text_patterns"):
        for poses in rem.keys():
            if pos in poses:
                remove_text_patterns = rem[poses]
                break

    def handle_table1(
        wxr: WiktextractContext,
        tablecontext: TableContext,
        word: str,
        lang: str,
        pos: str,
        data: WordData,
        tree: WikiNode,
        titles: list[str],
        source: str,
        after: str,
        depth: int,
    ) -> list[tuple[list[list[InflCell]], list[str], str, int]]:
        # rows, titles, after, depth
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order."""

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        # Filling for columns with rowspan > 1
        col_gap_data: list[InflCell | None] = []
        # Number of remaining rows for which to fill the column
        vertical_still_left: list[int] = []
        cols_headered: list[bool] = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows: list[list[InflCell]] = []

        sub_ret = []

        # from wikitextprocessor.parser import print_tree
        # print_tree(tree)
        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            # Normalize the node kind: HTML nodes are identified by their
            # tag name string, wikitext nodes by their NodeKind.
            kind: NodeKind | str
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are intially shown in tables
                    # that have more data. The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # if (
                #     len(node.children) == 1
                #     and node.children[0].attrs.get("class") == "separator"
                # ):
                #     print("------------------ skip separator")
                #     continue

                # Parse a table row.
                row: list[InflCell] = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in get_table_cells(node):
                    # loop through each cell in the ROW

                    # The below skip is not needed anymore, because we "skip" in
                    # get_table_cells, but left here as a comment
                    # if not isinstance(col, WikiNode):
                    #     # This skip is not used for counting,
                    #     # "None" is not used in
                    #     # indexing or counting or looping.
                    #     continue
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print("    UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column. It is populated at the end of the loop, at the
                        # same time as col_gap_data. This needs to be looped and
                        # filled this way because each `for col`-looping jumps
                        # straight to the next meaningful cell; there is no
                        # "None" cells, only emptiness between, and rowspan and
                        # colspan are just to generate the "fill-
                        vertical_still_left[len(row)] -= 1
                        # KJ Apr 2026
                        # type checking is ignored; I am pretty sure that
                        # row will never contain None, even if col_gap_data
                        # is `InflCell | None`, but this code is such
                        # spaghetti that it's hard to figure out, except
                        # by the process of elimination: this has never
                        # caused trouble before, ergo, it works.
                        row.append(col_gap_data[len(row)])  # type: ignore

                        # appending row is how "indexing" is
                        # done here; something is appended,
                        # like a filler-cell here or a "start"
                        # cell at the end of the row-loop,
                        # which increased len(row) which is
                        # then used as the target-index to check
                        # for gaps. vertical_still_left is
                        # the countdown to when to stop
                        # filling in gaps, and goes down to 0,
                        # and col_gap_data is not touched
                        # except when a new rowspan is needed,
                        # at the same time that
                        # vertical_still_left gets reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        # Non-numeric span attribute; fall back to 1x1.
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Too many of these errors; cap spans at 100 silently.
                    if colspan > 100:
                        # wxr.wtp.error(
                        #     f"Colspan {colspan} over 30, set to 1",
                        #     sortid="inflection/20250113a",
                        # )
                        colspan = 100
                    if rowspan > 100:
                        # wxr.wtp.error(
                        #     f"Rowspan {rowspan} over 30, set to 1",
                        #     sortid="inflection/20250113b",
                        # )
                        rowspan = 100

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: (
                            isinstance(x, WikiNode)
                            and (x.kind == NodeKind.TABLE or x.sarg == "table")
                        ),
                    )

                    # Clean the rest of the cell.
                    link_capture_dict: dict = {}
                    celltext = clean_node(
                        wxr, link_capture_dict, rest, collect_links=True
                    )
                    cell_links: list[tuple[str, str]] | None = (
                        link_capture_dict.get("links", None)
                    )
                    # print(f"CLEANED: {celltext=}")
                    # print(f"SUBTABLES: {tables}")
                    # print(f"{link_capture_dict=}")

                    # Remove regexed patterns from text
                    if remove_text_patterns is not None:
                        for pat in remove_text_patterns:
                            celltext = re.sub(pat, "", celltext)
                    # print(f"AFTER: {celltext=} <<")

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a descrip-
                        # tive text that should be treated as a title (e.g.,
                        # "Forms with the definite article", with "definite" not
                        # mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,  # type: ignore
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Flush the rows collected so far so that the
                            # subtable's rows are emitted in document order.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    # Strip trailing superscripts (footnote references).
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #     row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target, cell_links
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/ed if
                                # it is lacking slots
                                # for each column; col_gap_data and
                                # vertical_still_left are never
                                # reset to [], during
                                # the whole table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that
                            # will be used in the
                            # future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to be
                            # filled (row.append or
                            # row[col_gap_data[len(row)] =>
                            # rows), it is not reset to [], but decremented to 0
                            # each time a row gets something from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])  # type: ignore
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects. Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )
def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """Yield the cell children of a table-row node.

    Wikitext table cells sometimes contain raw HTML `<th>`/`<td>` elements,
    because it is easier to write wikitext conditionals that way; the parser
    then attaches those td-elements as children of the wikitext cell.  This
    generator yields every WikiNode child of ``node``, and when a cell has
    direct th/td children, those are detached from the cell (so their text
    is not emitted twice) and yielded immediately after it."""
    for child in node.children:
        if not isinstance(child, WikiNode):
            continue
        nested_cells = [
            c
            for c in child.children
            if isinstance(c, HTMLNode) and c.sarg in ("th", "td")
        ]
        if nested_cells:
            # Keep only the non-cell content on the parent so the nested
            # cells are not returned twice.
            child.children = [
                c
                for c in child.children
                if not (isinstance(c, HTMLNode) and c.sarg in ("th", "td"))
            ]
            yield child
            yield from nested_cells
        else:
            yield child
def handle_html_table(
    wxr: WiktextractContext,
    word: str,
    lang: str,
    pos: str,
    data: WordData,
    tree: WikiNode,
    titles: list[str],
    source: str,
    after: str,
    tablecontext: TableContext | None = None,
) -> None:
    """Thin wrapper that forwards an HTML table to
    handle_wikitext_or_html_table().  XXX could these wrappers be removed?"""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )
def handle_wikitext_table(
    wxr: WiktextractContext,
    word: str,
    lang: str,
    pos: str,
    data: WordData,
    tree: WikiNode,
    titles: list[str],
    source: str,
    after: str,
    tablecontext: TableContext | None = None,
) -> None:
    """A passer-on function for wikitext tables, XXX, remove these?

    Fix: the docstring was copy-pasted from handle_html_table() and wrongly
    said "html-tables"; this wrapper handles wikitext tables.  All arguments
    are forwarded unchanged to handle_wikitext_or_html_table()."""
    handle_wikitext_or_html_table(
        wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
    )
def parse_inflection_section(
    wxr: WiktextractContext,
    data: WordData,
    word: str,
    lang: str,
    pos: str,
    section: str,
    tree: WikiNode,
    tablecontext: TableContext | None = None,
) -> None:
    """Parses an inflection section on a page. ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    The parse tree ``tree`` (must be a ROOT node) is walked to collect
    inflection tables (both wikitext TABLE nodes and HTML ``<table>``
    elements) together with any title text that precedes them (e.g.
    NavFrame headers or a preceding bolded ``;`` list item), then each
    collected table is dispatched to the wikitext/HTML table handlers.
    """

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    #       .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Accumulated tables: (kind, table node, titles, trailing-text parts).
    # The last element collects raw text seen after the table node; it is
    # cleaned and passed to the handler as ``after``.
    tables: list[
        tuple[Literal["html", "wikitext"], WikiNode, list[str], list[str]]
    ] = []
    # Raw text fragments collected inside a NavFrame's NavHead, later
    # joined to form the title for the NavContent's tables.
    titleparts: list[str] = []
    # Text of the most recent ``;`` (bolded) list item; used as a title
    # for tables that follow it at the top level.
    preceding_bolded_title = ""

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("--------------******************----------------")

    def process_tables() -> None:
        # Dispatch every collected table to the matching handler.
        for kind, node, titles, after_l in tables:
            after = "".join(after_l).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node: WikiNode | str, titles: list[str]) -> None:
        # Handle a NavFrame div: collect its tables into a fresh ``tables``
        # list so they can be processed (with the NavHead-derived title)
        # before restoring the outer list.
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(
        node: WikiNode
        | str
        | list[WikiNode | str]
        | list[list[WikiNode | str]],
        titles: list[str],
        navframe=False,
    ) -> None:
        # Recursive tree walk.  ``navframe=True`` means we are inside a
        # NavFrame and are gathering title text rather than tables.
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Plain text: either trailing text after the latest table, or
            # (inside a NavFrame head) part of the upcoming table's title.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    # Show/hide button — irrelevant for extraction.
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    # The NavHead text collected so far becomes the title
                    # for the tables inside this NavContent, unless it is
                    # just a notes blurb.
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(("wikitext", node, titles, []))
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                htmlclasses = node.attrs.get("class", ())
                if "audiotable" in htmlclasses:
                    # Pronunciation audio tables are not inflection tables.
                    return
                tables.append(("html", node, titles, []))
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
        if (
            kind == NodeKind.HTML
            and node.sarg == "div"
            and "NavFrame" in node.attrs.get("class", "").split()
        ):
            recurse_navframe(node, titles)
            return
        if kind == NodeKind.LINK:
            # Recurse into the link's display text if present, otherwise
            # into its target.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.HTML and node.sarg == "ref":
            # Footnote references do not contain tables we want.
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A bolded ";" list item acts as a title for following tables.
            nonlocal preceding_bolded_title
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")