Coverage for src/wiktextract/extractor/en/inflection.py: 86%
1481 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1# Code for parsing inflection tables.
2#
3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org.
5import collections
6import copy
7import functools
8import html
9import itertools
10import re
11import unicodedata
12from typing import Optional, Union
14from wikitextprocessor import MAGIC_FIRST, NodeKind, WikiNode
16from ...clean import clean_value
17from ...datautils import data_append, freeze, split_at_comma_semi
18from ...tags import valid_tags
19from ...wxr_context import WiktextractContext
20from .form_descriptions import (
21 classify_desc,
22 decode_tags,
23 distw,
24 parse_head_final_tags,
25)
26from .inflectiondata import infl_map, infl_start_map, infl_start_re
27from .lang_specific_configs import get_lang_conf, lang_specific_tags
28from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS
29from .type_utils import FormData
# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
# None disables the debug output; set via set_debug_cell_text() below.
debug_cell_text: Optional[str] = None
def set_debug_cell_text(text: str) -> None:
    """Set the module-level cell text that triggers debug printing.

    Called from command-line handling (--debug-text-cell) to make the
    table parser emit messages whenever it encounters ``text``."""
    global debug_cell_text
    debug_cell_text = text
# Type alias: a list of alternative tag tuples describing one cell/header.
TagSets = list[tuple[str, ...]]

# Column texts that are interpreted as an empty column.
# Most entries are different Unicode hyphen/dash characters.
IGNORED_COLVALUES = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "-",
    "/",
    "?",
    "not used",
    "not applicable",
}
# These tags are never inherited from above
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags
# (the class-N entries are Bantu noun classes).
object_concord_replacements = {
    "first-person": "object-first-person",
    "second-person": "object-second-person",
    "third-person": "object-third-person",
    "singular": "object-singular",
    "plural": "object-plural",
    "definite": "object-definite",
    "indefinite": "object-indefinite",
    "class-1": "object-class-1",
    "class-2": "object-class-2",
    "class-3": "object-class-3",
    "class-4": "object-class-4",
    "class-5": "object-class-5",
    "class-6": "object-class-6",
    "class-7": "object-class-7",
    "class-8": "object-class-8",
    "class-9": "object-class-9",
    "class-10": "object-class-10",
    "class-11": "object-class-11",
    "class-12": "object-class-12",
    "class-13": "object-class-13",
    "class-14": "object-class-14",
    "class-15": "object-class-15",
    "class-16": "object-class-16",
    "class-17": "object-class-17",
    "class-18": "object-class-18",
    "masculine": "object-masculine",
    "feminine": "object-feminine",
}
# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: every mapped value must consist of valid tags.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# Matches table-title prefixes like "Conjugation of X" that must NOT be
# treated as tag-bearing words by the maps above.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)
# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
}
# Import-time sanity check: every mapped value must consist of valid tags.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
# Case-insensitive word-boundary match against the keys above; the
# table_hdr_ign_part alternative lets callers detect and skip false hits
# coming from "Conjugation of ..." style title prefixes.
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)
# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "Attic": "Attic",  # e.g. καλός/Greek/Adj
    "Epic": "Epic",  # e.g. καλός/Greek/Adj
}
# Import-time sanity check: every mapped value must consist of valid tags.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# Parenthesized element starts to map them to tags for form for the rest of
# the element
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: every mapped value must consist of valid tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Matches a recognized start (followed by a space) at the beginning of a
# parenthesized title element.
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)
# Regexp for cell starts that are likely definitions of reference symbols.
# Matches things like "* ", "† ", "1)", superscript digits/letters, etc.
# See also nondef_re.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"

# Certain tags are moved from headers in tables into word tags, as they always
# apply to the whole word.
TAGS_FORCED_WORDTAGS: set[str] = set(
    [
        # This was originally created for a issue with number paradigms in
        # Arabic, but that is being handled elsewhere now.
    ]
)
class InflCell:
    """Cell in an inflection table.

    Attributes:
        text: cleaned (stripped) cell text.
        is_title: True when the cell is a (non-empty) header cell.
        colspan/rowspan: cell span, always >= 1.
        target: optional link target associated with the cell.
    """

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        self.text = text.strip()
        # Fix: base the title flag on the *stripped* text and coerce to
        # bool.  Previously ``text and is_title`` could yield "" (the raw
        # text) instead of False, and a whitespace-only cell was treated
        # as a title even though its cleaned text is empty.
        self.is_title = bool(self.text) and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        v = "{}/{}/{}/{!r}".format(
            self.text, self.is_title, self.colspan, self.rowspan
        )
        if self.target:
            v += ": {!r}".format(self.target)
        return v

    def __repr__(self) -> str:
        return str(self)
class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: TagSets,
        text: str,
        all_headers_row: bool,
    ) -> None:
        # Argument validation.  NOTE(review): rowspan is not isinstance-
        # checked here, matching the original behavior.
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        for ts in tagsets:
            assert isinstance(ts, tuple)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Normalize every alternative: deduplicate its tags and sort them.
        self.tagsets = [tuple(sorted(set(ts))) for ts in tagsets]
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False
def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        unicode_name = unicodedata.name(ch)
    except ValueError:
        # Characters without a Unicode name cannot be superscripts.
        return False
    # Superscripts are SUPERSCRIPT digits or MODIFIER LETTER variants.
    return unicode_name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )
def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no
    purpose together (cover all options).  Mutates ``tags`` in place."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Language-specific pairwise removals.
    if (
        "animate" in tags
        and "inanimate" in tags
        and get_lang_conf(lang, "animate_inanimate_remove")
    ):
        tags.remove("animate")
        tags.remove("inanimate")
    if (
        "virile" in tags
        and "nonvirile" in tags
        and get_lang_conf(lang, "virile_nonvirile_remove")
    ):
        tags.remove("virile")
        tags.remove("nonvirile")
    # If every tag of a whole category in this language is present
    # (e.g. all numbers, all genders), the combination conveys no
    # information; remove them all.  This replaces six copy-pasted
    # blocks with one data-driven loop.
    for conf_key in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        category_tags = get_lang_conf(lang, conf_key)
        if category_tags and all(x in tags for x in category_tags):
            for x in category_tags:
                tags.remove(x)
def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    return {valid_tags[tag] for alternative in tagset for tag in alternative}
def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed to be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # Fix: this assertion previously re-validated tagsets1; it must
    # validate tagsets2.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Add one alternative to ``tagsets``, merging it with an existing
        # alternative when they differ in at most one tag category.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            # print("tags1={} tags2={} num_differ={}"
            #       .format(tags1, tags2, num_differ))
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        # Always return at least one (possibly empty) alternative.
        tagsets.append(())
    # print("or_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, tagsets))
    return tagsets
def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking the union of all combinations, without
    trying to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # Fix: this assertion previously re-validated tagsets1; it must
    # validate tagsets2.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            # Union of the two alternatives, cleaned of redundant tags.
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    # print("and_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, new_tagsets))
    return new_tagsets
@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags), where ``cleaned`` is the remaining cell
    text, ``refs`` are reference-marker symbols found in the cell,
    ``defs`` are (symbol, definition-text) pairs when the cell defines
    reference symbols, and ``tags`` are header tags extracted from
    special reference markers."""
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags = []
    # Strip trailing comma/bullet and collapse whitespace.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Free-text notes ("see ...", "Note: ...", etc.) are not headers;
    # mark them so the caller can skip the cell.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers of the form "^x" or "^(x,y)"
    # from the end of the cell, repeatedly.
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                # Language-specific marker that expands to tags.
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # The cell defines reference symbols; split it into
        # (symbol, definition-text) pairs.
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    while col:
        # Peel trailing superscript characters (and "†") one at a time.
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition, e.g. "1) ..." or "2: ...".
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    # .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags
@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is dictionary describing additional forms to be
    included in the part-of-speech entry)."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Normalize: decode HTML entities, strip markup, collapse whitespace.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags = []
    table_tags = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip matches that are part of "Conjugation of ..." style prefixes.
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    # Recognized element prefix; the remainder becomes
                    # an extra form tagged with the mapped tags.
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms
769def expand_header(
770 wxr: WiktextractContext,
771 tablecontext: "TableContext",
772 word: str,
773 lang: str,
774 pos: str,
775 text: str,
776 base_tags: Union[list[str], set[str], tuple[str, ...]],
777 silent=False,
778 ignore_tags=False,
779 depth=0,
780) -> list[tuple[str, ...]]:
781 """Expands a cell header to tagset, handling conditional expressions
782 in infl_map. This returns list of tuples of tags, each list element
783 describing an alternative interpretation. ``base_tags`` is combined
784 column and row tags for the cell in which the text is being interpreted
785 (conditional expressions in inflection data may depend on it).
786 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags``
787 is True, then tags listed in "if" will be ignored in the test (this is
788 used when trying to heuristically detect whether a non-<th> cell is anyway
789 a header)."""
790 assert isinstance(wxr, WiktextractContext)
791 assert isinstance(word, str)
792 assert isinstance(lang, str)
793 assert isinstance(pos, str)
794 assert isinstance(text, str)
795 assert isinstance(base_tags, (list, tuple, set))
796 assert silent in (True, False)
797 assert isinstance(depth, int)
798 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags))
799 # First map the text using the inflection map
800 text = clean_value(wxr, text)
801 combined_return: list[tuple[str, ...]] = []
802 parts = split_at_comma_semi(text, separators=[";"])
803 for text in parts:
804 if not text: 804 ↛ 805line 804 didn't jump to line 805 because the condition on line 804 was never true
805 continue
806 if text in infl_map:
807 v = infl_map[text] # list or string
808 else:
809 m = re.match(infl_start_re, text)
810 if m is not None: 810 ↛ 811line 810 didn't jump to line 811 because the condition on line 810 was never true
811 v = infl_start_map[m.group(1)]
812 # print("INFL_START {} -> {}".format(text, v))
813 elif re.match(r"Notes", text):
814 # Ignored header
815 # print("IGNORING NOTES")
816 combined_return = or_tagsets(
817 lang, pos, combined_return, [("dummy-skip-this",)]
818 )
819 # this just adds dummy-skip-this
820 continue
821 elif text in IGNORED_COLVALUES:
822 combined_return = or_tagsets(
823 lang, pos, combined_return, [("dummy-ignore-skipped",)]
824 )
825 continue
826 # Try without final parenthesized part
827 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text)
828 if text_without_parens in infl_map:
829 v = infl_map[text_without_parens]
830 elif m is None: 830 ↛ 846line 830 didn't jump to line 846 because the condition on line 830 was always true
831 if not silent:
832 wxr.wtp.debug(
833 "inflection table: unrecognized header: {}".format(
834 repr(text)
835 ),
836 sortid="inflection/735",
837 )
838 # Unrecognized header
839 combined_return = or_tagsets(
840 lang, pos, combined_return, [("error-unrecognized-form",)]
841 )
842 continue
844 # Then loop interpreting the value, until the value is a simple string.
845 # This may evaluate nested conditional expressions.
846 default_then = None
847 while True:
848 # If it is a string, we are done.
849 if isinstance(v, str):
850 tags = set(v.split())
851 remove_useless_tags(lang, pos, tags)
852 tagset = [tuple(sorted(tags))]
853 break
854 # For a list, just interpret it as alternatives. (Currently the
855 # alternatives must directly be strings.)
856 if isinstance(v, (list, tuple)):
857 tagset = []
858 for x in v:
859 tags = set(x.split())
860 remove_useless_tags(lang, pos, tags)
861 tags_t = tuple(sorted(tags))
862 if tags_t not in tagset: 862 ↛ 858line 862 didn't jump to line 858 because the condition on line 862 was always true
863 tagset.append(tags_t)
864 break
865 # Otherwise the value should be a dictionary describing a
866 # conditional expression.
867 if not isinstance(v, dict): 867 ↛ 868line 867 didn't jump to line 868 because the condition on line 867 was never true
868 wxr.wtp.debug(
869 "inflection table: internal: "
870 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]),
871 sortid="inflection/767",
872 )
873 tagset = [()]
874 break
875 # Evaluate the conditional expression.
876 assert isinstance(v, dict)
877 cond: Union[bool, str] = "default-true"
878 c: Union[str, list[str], set[str]] = ""
879 # Handle "lang" condition. The value must be either a
880 # single language or a list of languages, and the
881 # condition evaluates to True if the table is one of
882 # those languages.
883 if "lang" in v:
884 c = v["lang"]
885 if isinstance(c, str):
886 cond = c == lang
887 else:
888 assert isinstance(c, (list, tuple, set))
889 cond = lang in c
890 # Handle "nested-table-depth" condition. The value must
891 # be an int or list of ints, and the condition evaluates
892 # True if the depth is one of those values.
893 # "depth" is how deep into a nested table tree the current
894 # table lies. It is first started in handle_wikitext_table,
895 # so only applies to tables-within-tables, not other
896 # WikiNode content. `depth` is currently only passed as a
897 # parameter down the table parsing stack, and not stored.
898 if cond and "nested-table-depth" in v: 898 ↛ 899line 898 didn't jump to line 899 because the condition on line 898 was never true
899 d = v["nested-table-depth"]
900 if isinstance(d, int):
901 cond = d == depth
902 else:
903 assert isinstance(d, (list, tuple, set))
904 cond = depth in d
905 # Handle inflection-template condition. Must be a string
906 # or list of strings, and if tablecontext.template_name is in
907 # those, accept the condition.
908 # TableContext.template_name is passed down from page/
909 # parse_inflection, before parsing and expanding itself
910 # has begun.
911 if cond and tablecontext and "inflection-template" in v:
912 d1 = v["inflection-template"]
913 if isinstance(d1, str): 913 ↛ 916line 913 didn't jump to line 916 because the condition on line 913 was always true
914 cond = d1 == tablecontext.template_name
915 else:
916 assert isinstance(d1, (list, tuple, set))
917 cond = tablecontext.template_name in d1
918 # Handle "pos" condition. The value must be either a single
919 # part-of-speech or a list of them, and the condition evaluates to
920 # True if the part-of-speech is any of those listed.
921 if cond and "pos" in v:
922 c = v["pos"]
923 if isinstance(c, str):
924 cond = c == pos
925 else:
926 assert isinstance(c, (list, tuple, set))
927 cond = pos in c
928 # Handle "if" condition. The value must be a string containing a
929 # space-separated list of tags. The condition evaluates to True if
930 # ``base_tags`` contains all of the listed tags. If the condition
931 # is of the form "any: ...tags...", then any of the tags will be
932 # enough.
933 if cond and "if" in v and not ignore_tags:
934 c = v["if"]
935 assert isinstance(c, str)
936 # "if" condition is true if any of the listed tags is present if
937 # it starts with "any:", otherwise all must be present
938 if c.startswith("any: "):
939 cond = any(t in base_tags for t in c[5:].split())
940 else:
941 cond = all(t in base_tags for t in c.split())
943 # Handle "default" assignment. Store the value to be used
944 # as a default later.
945 if "default" in v:
946 assert isinstance(v["default"], str)
947 default_then = v["default"]
949 # Warning message about missing conditions for debugging.
951 if cond == "default-true" and not default_then and not silent:
952 wxr.wtp.debug(
953 "inflection table: IF MISSING COND: word={} "
954 "lang={} text={} base_tags={} c={} cond={}".format(
955 word, lang, text, base_tags, c, cond
956 ),
957 sortid="inflection/851",
958 )
959 # Based on the result of evaluating the condition, select either
960 # "then" part or "else" part.
961 if cond:
962 v = v.get("then", "")
963 else:
964 v1 = v.get("else")
965 if v1 is None:
966 if default_then:
967 v = default_then
968 else:
969 if not silent:
970 wxr.wtp.debug(
971 "inflection table: IF WITHOUT ELSE EVALS "
972 "False: "
973 "{}/{} {!r} base_tags={}".format(
974 word, lang, text, base_tags
975 ),
976 sortid="inflection/865",
977 )
978 v = "error-unrecognized-form"
979 else:
980 v = v1
982 # Merge the resulting tagset from this header part with the other
983 # tagsets from the whole header
984 combined_return = or_tagsets(lang, pos, combined_return, tagset)
986 # Return the combined tagsets, or empty tagset if we got no tagsets
987 if not combined_return:
988 combined_return = [()]
989 return combined_return
def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: "list[HdrSpan]",
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Walks ``hdrspans`` in reverse (headers closest to the data cell first),
    merging tagsets from every header that horizontally overlaps the column
    range ``[start, start + colspan)``.  Language-specific configuration
    (``get_lang_conf``) decides, for each pair of tag categories seen lower
    vs. higher in the table, whether to merge, skip, or stop scanning.

    Args:
        lang: language name of the table (used for language-specific rules).
        pos: part-of-speech of the word.
        hdrspans: accumulated HdrSpan objects for headers seen so far.
        start: starting column index of the data cell.
        colspan: width of the data cell in columns.
        celltext: cell text; used only for debug printing.

    Returns:
        A list of alternative tag tuples (at least ``[()]``).
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                "  row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    used = set()  # (start, colspan) keys already consumed at some column pos
    coltags = [()]
    last_header_row = 1000000  # sentinel: no header row accepted yet
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000  # sentinel: not on any row yet
    used_hdrspans = set()  # id()s of hdrspans already merged in
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets. This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row. Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside
                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above. However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 10000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags
1374def parse_simple_table(
1375 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
1376):
1377 """This is the default table parser. Despite its name, it can parse
1378 complex tables. This returns a list of forms to be added to the
1379 part-of-speech, or None if the table could not be parsed."""
1380 assert isinstance(wxr, WiktextractContext)
1381 assert isinstance(tablecontext, TableContext)
1382 assert isinstance(word, str)
1383 assert isinstance(lang, str)
1384 assert isinstance(pos, str)
1385 assert isinstance(rows, list)
1386 assert isinstance(source, str)
1387 assert isinstance(after, str)
1388 assert isinstance(depth, int)
1389 for row in rows:
1390 for col in row:
1391 assert isinstance(col, InflCell)
1392 assert isinstance(titles, list)
1393 for x in titles:
1394 assert isinstance(x, str)
1396 # print("PARSE_SIMPLE_TABLE: TITLES:", titles)
1397 if debug_cell_text: 1397 ↛ 1398line 1397 didn't jump to line 1398 because the condition on line 1397 was never true
1398 print("ROWS:")
1399 for row in rows:
1400 print(" ", row)
1402 # Check for forced rowspan kludge. See e.g.
1403 # maorski/Serbo-Croatian. These are essentially multi-row
1404 # cells implemented using <br> rather than separate cell. We fix this
1405 # by identifying rows where this happens, and splitting the current row
1406 # to multiple rows by synthesizing additional cells.
1407 new_rows = []
1408 for row in rows:
1409 split_row = (
1410 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row)
1411 and
1412 # x is an InflCell
1413 all(x.rowspan == 1 for x in row)
1414 )
1415 if not split_row:
1416 new_rows.append(row)
1417 continue
1418 row1 = []
1419 row2 = []
1420 for cell in row:
1421 cell1 = copy.deepcopy(cell)
1422 if "\n" in cell.text:
1423 # Has more than one line - split this cell
1424 parts = cell.text.strip().splitlines()
1425 if len(parts) != 2: 1425 ↛ 1426line 1425 didn't jump to line 1426 because the condition on line 1425 was never true
1426 wxr.wtp.debug(
1427 "forced rowspan kludge got {} parts: {!r}".format(
1428 len(parts), cell.text
1429 ),
1430 sortid="inflection/1234",
1431 )
1432 cell2 = copy.deepcopy(cell)
1433 cell1.text = parts[0]
1434 cell2.text = parts[1]
1435 else:
1436 cell1.rowspan = 2
1437 cell2 = cell1 # ref, not a copy
1438 row1.append(cell1)
1439 row2.append(cell2)
1440 new_rows.append(row1)
1441 new_rows.append(row2)
1442 rows = new_rows
1443 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:")
1444 # for row in rows:
1445 # print(" ", row)
1447 # Parse definitions for references (from table itself and from text
1448 # after it)
1449 def_ht = {}
1451 def add_defs(defs: list[tuple[str, str]]) -> None:
1452 for ref, d in defs:
1453 # print("DEF: ref={} d={}".format(ref, d))
1454 d = d.strip()
1455 d = d.split(". ")[0].strip() # text before ". "
1456 if not d: 1456 ↛ 1457line 1456 didn't jump to line 1457 because the condition on line 1456 was never true
1457 continue
1458 if d.endswith("."): # catc ".."??
1459 d = d[:-1]
1460 tags, topics = decode_tags(d, no_unknown_starts=True)
1461 # print(f"{ref=}, {d=}, {tags=}")
1462 if topics or any("error-unknown-tag" in ts for ts in tags):
1463 d = d[0].lower() + d[1:]
1464 tags, topics = decode_tags(d, no_unknown_starts=True)
1465 if topics or any("error-unknown-tag" in ts for ts in tags):
1466 # Failed to parse as tags
1467 # print("Failed: topics={} tags={}"
1468 # .format(topics, tags))
1469 continue
1470 tags1_s: set[str] = set()
1471 for ts in tags:
1472 tags1_s.update(ts)
1473 tags1 = tuple(sorted(tags1_s))
1474 # print("DEFINED: {} -> {}".format(ref, tags1))
1475 def_ht[ref] = tags1
    def generate_tags(
        rowtags: list[tuple[str, ...]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        """Combine row, column, global and table tags for the current header.

        Uses the enclosing scope's hdrspans/col_idx/colspan/col/text/
        global_tags/refs_tags/hdr_tags/depth.  NOTE: ``table_tags`` is
        mutated in place when forced word-level tags are encountered.

        Returns (new_rowtags, new_coltags, all_hdr_tags), each a list of
        sorted tag tuples without duplicates.
        """
        new_coltags = []
        all_hdr_tags = []  # list of tuples
        new_rowtags = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                base_tags: set[str] = (
                    set(rt0)
                    | set(ct0)
                    | set(global_tags)
                    | set(itertools.chain.from_iterable(table_tags))
                )  # Union.
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Certain tags are always moved to word-level tags
                    if tt_s & TAGS_FORCED_WORDTAGS:
                        table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS)
                        tt_s = tt_s - TAGS_FORCED_WORDTAGS
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags
    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        """Create a HdrSpan for the current header cell and update the
        left-column (col0) expansion state.

        Uses col_idx/colspan/rowspan/rownum/new_coltags/all_headers/
        all_hdr_tags/previously_seen from the enclosing scope.  Returns the
        (possibly unchanged) col text plus updated col0 tracking state.
        """
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags). In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            # - col0_hdrspan set, continue with allowed current
            # - col0_hdrspan set, expand, start new
            # - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan. Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan
                # *Expand* current col0_hdrspan
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan
    def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
        """Split a cell's text into alternative forms.

        Returns (col, alts, split_extra_tags) where ``col`` may have had
        language-specific transformation patterns replaced (and restored),
        ``alts`` is the list of split alternatives, and ``split_extra_tags``
        holds extra tags from language-specific special phrase splits.
        """
        # Split the cell text into alternatives
        split_extra_tags = []
        if col and is_superscript(col[0]):
            # Superscript-initial cells are not split (footnote-like text).
            alts = [col]
        else:
            separators = [";", "•", r"\n", " or "]
            if " + " not in col:
                separators.append(",")
            if not col.endswith("/"):
                separators.append("/")
            if col in special_phrase_splits:
                # Use language-specific special splits.
                # These are phrases and constructions that have
                # unique ways of splitting, not specific characters
                # to split on like with the default splitting.
                alts, tags = special_phrase_splits[col]
                split_extra_tags = tags.split()
                for x in split_extra_tags:
                    assert x in valid_tags
                assert isinstance(alts, (list, tuple))
                assert isinstance(tags, str)
            else:
                # Use default splitting. However, recognize
                # language-specific replacements and change them to magic
                # characters before splitting. This way we won't split
                # them. This is important for, e.g., recognizing
                # alternative pronouns.
                # The magic characters are characters out of Unicode scope
                # that are given a simple incremental value, int > unicode.
                repls = {}
                magic_ch = MAGIC_FIRST
                trs = get_lang_conf(lang, "form_transformations")
                # trs is a list of lists of strings
                for _, v, _, _ in trs:
                    # v is a pattern string, like "^ich"
                    # form_transformations data is doing double-duty here,
                    # because the pattern strings are already known to us and
                    # not meant to be split.
                    m = re.search(v, col)
                    if m is not None:
                        # if pattern found in text
                        magic = chr(magic_ch)
                        magic_ch += 1  # next magic character value
                        col = re.sub(v, magic, col)  # replace with magic ch
                        repls[magic] = m.group(0)
                        # remember what regex match string each magic char
                        # replaces. .group(0) is the whole match.
                alts0 = split_at_comma_semi(col, separators=separators)
                # with magic characters in place, split the text so that
                # pre-transformation text is out of the way.
                alts = []
                for alt in alts0:
                    # create a new list with the separated items and
                    # the magic characters replaced with the original texts.
                    for k, v in repls.items():
                        alt = re.sub(k, v, alt)
                    alts.append(alt)
        # Remove "*" from beginning of forms, as in non-attested
        # or reconstructed forms. Otherwise it might confuse romanization
        # detection.
        alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts)
        alts = list(
            x for x in alts if not re.match(r"pronounced with |\(with ", x)
        )
        alts = list(
            re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts
        )
        # Check for parenthesized alternatives, e.g. ripromettersi/Italian
        if all(
            re.match(r"\w+( \w+)* \(\w+( \w+)*(, \w+( \w+)*)*\)$", alt)
            # word word* \(word word*(, word word*)*\)
            and all(
                distw([re.sub(r" \(.*", "", alt)], x) < 0.5
                # Levenshtein distance
                for x in re.sub(r".*\((.*)\)", r"\1", alt).split(", ")
            )
            # Extract from parentheses for testin
            for alt in alts
        ):
            new_alts = []
            for alt in alts:
                # Replace parentheses before splitting
                alt = alt.replace(" (", ", ")
                alt = alt.replace(")", "")
                for new_alt in alt.split(", "):
                    new_alts.append(new_alt)
            alts = new_alts
        return col, alts, split_extra_tags
    def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
        """Normalize cell lines into (form, romanization, ipa) triples.

        Handle the special case where romanization is given under the
        normal form, e.g. in Russian. There can be multiple
        comma-separated forms in each case. We also handle the case
        where instead of romanization we have IPA pronunciation
        (e.g., avoir/French/verb).

        The branches below are heuristics tried in order; the first
        layout that matches wins. The fallback branch expands
        parenthesized letter alternatives like "kind(er)" instead.
        """
        len2 = len(alts) // 2
        # Check for IPAs (forms first, IPAs under)
        # base, base, IPA, IPA
        if (
            len(alts) % 2 == 0  # Divisibly by two
            and all(
                re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
                for x in alts[len2:]
            )
        ):  # In the second half of alts
            nalts = list(
                (alts[i], "", alts[i + len2])
                # List of tuples: (base, "", ipa)
                for i in range(len2)
            )
        # base, base, base, IPA
        elif (
            len(alts) > 2
            and re.match(r"^\s*/.*/\s*$", alts[-1])
            and all(not x.startswith("/") for x in alts[:-1])
        ):
            # Only if the last alt is IPA; the one IPA is shared by all bases
            nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
        # base, IPA, IPA, IPA
        elif (
            len(alts) > 2
            and not alts[0].startswith("/")
            and all(
                re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
            )
        ):
            # First is base and the rest is IPA alternatives
            nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

        # Check for romanizations, forms first, romanizations under
        elif (
            len(alts) % 2 == 0
            and not any("(" in x for x in alts)
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        # Remove ends of strings starting from ^.
                        # Superscripts have been already removed
                        # from the string, while ^xyz needs to be
                        # removed separately, though it's usually
                        # something with a single letter?
                        "".join(xx for xx in x if not is_superscript(xx)),
                    )
                )
                == "other"
                for x in alts[:len2]
            )
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in x if not is_superscript(xx)),
                    )
                )
                in ("romanization", "english")
                for x in alts[len2:]
            )
        ):
            nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
        # Check for romanizations, forms and romanizations alternating
        elif (
            len(alts) % 2 == 0
            and not any("(" in x for x in alts)
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in alts[i] if not is_superscript(xx)),
                    )
                )
                == "other"
                for i in range(0, len(alts), 2)
            )
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in alts[i] if not is_superscript(xx)),
                    )
                )
                in ("romanization", "english")
                for i in range(1, len(alts), 2)
            )
        ):
            # odds (1, 3, ...) are romanizations for the evens (0, 2, ...)
            nalts = list(
                (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
            )
        # evens
        else:
            # Fallback: no romanization/IPA layout detected. Expand
            # parenthesized alternatives embedded in each form instead.
            new_alts = []
            for alt in alts:
                lst = [""]
                idx = 0
                for m in re.finditer(
                    r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                    # start OR letter OR asterisk (word/word*)
                    # \\___________group 1_______/ \  \_g3_///
                    #                               \__gr. 2_//
                    # \_____________group 0________________/
                    alt,
                ):
                    v = m.group(2)  # (word/word/word...)
                    if (
                        classify_desc(v) == "tags"  # Tags inside parens
                        or m.group(0) == alt
                    ):  # All in parens
                        continue
                    new_lst = []
                    for x in lst:
                        x += alt[idx : m.start()] + m.group(1)
                        # alt until letter or asterisk
                        idx = m.end()
                        vparts = v.split("/")
                        # group(2) = ["word", "wörd"...]
                        if len(vparts) == 1:
                            new_lst.append(x)
                            new_lst.append(x + v)
                            # "kind(er)" -> ["kind", "kinder"]
                        else:
                            for vv in vparts:
                                new_lst.append(x + vv)
                                # "lampai(tten/den)" ->
                                # ["lampaitten", "lampaiden"]
                    lst = new_lst
                for x in lst:
                    new_alts.append(x + alt[idx:])
                    # add the end of alt
            nalts = list((x, "", "") for x in new_alts)
            # [form, no romz, no ipa]
        return nalts
1874 def find_semantic_parens(form: str) -> tuple[str, list[str]]:
1875 # "Some languages" (=Greek) use brackets to mark things that
1876 # require tags, like (informality), [rarity] and {archaicity}.
1877 extra_tags = []
1878 if re.match(r"\([^][(){}]*\)$", form):
1879 if get_lang_conf(lang, "parentheses_for_informal"):
1880 form = form[1:-1]
1881 extra_tags.append("informal")
1882 else:
1883 form = form[1:-1]
1884 elif re.match(r"\{\[[^][(){}]*\]\}$", form):
1885 if get_lang_conf( 1885 ↛ 1892line 1885 didn't jump to line 1892 because the condition on line 1885 was always true
1886 lang, "square_brackets_for_rare"
1887 ) and get_lang_conf(lang, "curly_brackets_for_archaic"):
1888 # είμαι/Greek/Verb
1889 form = form[2:-2]
1890 extra_tags.extend(["rare", "archaic"])
1891 else:
1892 form = form[2:-2]
1893 elif re.match(r"\{[^][(){}]*\}$", form):
1894 if get_lang_conf(lang, "curly_brackets_for_archaic"): 1894 ↛ 1899line 1894 didn't jump to line 1899 because the condition on line 1894 was always true
1895 # είμαι/Greek/Verb
1896 form = form[1:-1]
1897 extra_tags.extend(["archaic"])
1898 else:
1899 form = form[1:-1]
1900 elif re.match(r"\[[^][(){}]*\]$", form):
1901 if get_lang_conf(lang, "square_brackets_for_rare"): 1901 ↛ 1906line 1901 didn't jump to line 1906 because the condition on line 1901 was always true
1902 # είμαι/Greek/Verb
1903 form = form[1:-1]
1904 extra_tags.append("rare")
1905 else:
1906 form = form[1:-1]
1907 return form, extra_tags
1909 def handle_parens(
1910 form: str, roman: str, clitic: str, extra_tags: list[str]
1911 ) -> tuple[str, str, str]:
1912 if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
1913 # is there a clitic starting with apostrophe?
1914 clitic = paren
1915 # assume the whole paren is a clitic
1916 # then remove paren from form
1917 form = (form[: m.start()] + subst + form[m.end() :]).strip()
1918 elif classify_desc(paren) == "tags":
1919 tagsets1, topics1 = decode_tags(paren)
1920 if not topics1: 1920 ↛ 1941line 1920 didn't jump to line 1941 because the condition on line 1920 was always true
1921 for ts in tagsets1:
1922 ts = tuple(x for x in ts if " " not in x)
1923 # There are some generated tags containing
1924 # spaces; do not let them through here.
1925 extra_tags.extend(ts)
1926 form = (form[: m.start()] + subst + form[m.end() :]).strip()
1927 # brackets contain romanization
1928 elif ( 1928 ↛ 1937line 1928 didn't jump to line 1937 because the condition on line 1928 was never true
1929 m.start() > 0
1930 and not roman
1931 and classify_desc(form[: m.start()]) == "other"
1932 and
1933 # "other" ~ text
1934 classify_desc(paren) in ("romanization", "english")
1935 and not re.search(r"^with |-form$", paren)
1936 ):
1937 roman = paren
1938 form = (form[: m.start()] + subst + form[m.end() :]).strip()
1939 elif re.search(r"^with |-form", paren): 1939 ↛ 1940line 1939 didn't jump to line 1940 because the condition on line 1939 was never true
1940 form = (form[: m.start()] + subst + form[m.end() :]).strip()
1941 return form, roman, clitic
    def merge_row_and_column_tags(form: str, some_has_covered_text: bool):
        """Combine row-header and column-header tags into form entries.

        For every (rowtags x coltags) combination, merges in global,
        extra, footnote-reference and section-header tags, applies
        language-specific adjustments and a series of tag cleanups,
        filters out ignorable cells, and emits one form dict per
        combination (plus a separate "clitic" entry when a clitic was
        extracted earlier).

        Returns (ret, form, some_has_covered_text): the list of form
        dicts, the possibly-adjusted form string, and the updated
        covered-text flag.

        NOTE(review): reads many enclosing-scope variables (rowtags,
        coltags, global_tags, extra_tags, refs_tags, tablecontext,
        col_idx, has_covering_hdr, roman, ipa, clitic, source, lang,
        pos, wxr) — it only makes sense inside the enclosing
        table-parsing function.
        """
        # Merge column tags and row tags. We give preference
        # to moods etc coming from rowtags (cf. austteigen/German/Verb
        # imperative forms).

        # In certain cases, what a tag means depends on whether
        # it is a row or column header. Depending on the language,
        # we replace certain tags with others if they're in
        # a column or row

        ret = []
        # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
        # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
        for rt in sorted(rowtags):
            if "dummy-use-as-coltags" in rt:
                continue
            # if lang was in rowtag_replacements)
            # if not rtagreplacs == None:
            #     rt = replace_directional_tags(rt, rtagreplacs)
            for ct in sorted(coltags):
                if "dummy-use-as-rowtags" in ct:
                    continue
                # if lang was in coltag_replacements
                # if not ctagreplacs == None:
                #     ct = replace_directional_tags(ct,
                #                                   ctagreplacs)
                tags = set(global_tags)
                tags.update(extra_tags)
                tags.update(rt)
                tags.update(refs_tags)
                tags.update(tablecontext.section_header)
                # Merge tags from column. For certain kinds of tags,
                # those coming from row take precedence.
                old_tags = set(tags)
                for t in ct:
                    c = valid_tags[t]
                    # Skip column mood/case/number tags when the row
                    # already supplied a tag of the same category.
                    if c in ("mood", "case", "number") and any(
                        valid_tags[tt] == c for tt in old_tags
                    ):
                        continue
                    tags.add(t)

                # Extract language-specific tags from the
                # form. This may also adjust the form.
                form, lang_tags = lang_specific_tags(lang, pos, form)
                tags.update(lang_tags)

                # For non-finite verb forms, see if they have
                # a gender/class suffix
                if pos == "verb" and any(
                    valid_tags[t] == "non-finite" for t in tags
                ):
                    form, tt = parse_head_final_tags(wxr, lang, form)
                    tags.update(tt)

                # Remove "personal" tag if have nth person; these
                # come up with e.g. reconhecer/Portuguese/Verb. But
                # not if we also have "pronoun"
                if (
                    "personal" in tags
                    and "pronoun" not in tags
                    and any(
                        x in tags
                        for x in [
                            "first-person",
                            "second-person",
                            "third-person",
                        ]
                    )
                ):
                    tags.remove("personal")

                # If we have impersonal, remove person and number.
                # This happens with e.g. viajar/Portuguese/Verb
                if "impersonal" in tags:
                    tags = tags - set(
                        [
                            "first-person",
                            "second-person",
                            "third-person",
                            "singular",
                            "plural",
                        ]
                    )

                # Remove unnecessary "positive" tag from verb forms
                if pos == "verb" and "positive" in tags:
                    if "negative" in tags:
                        tags.remove("negative")
                    tags.remove("positive")

                # Many Russian (and other Slavic) inflection tables
                # have animate/inanimate distinction that generates
                # separate entries for neuter/feminine, but the
                # distinction only applies to masculine. Remove them
                # form neuter/feminine and eliminate duplicates.
                if get_lang_conf(lang, "masc_only_animate"):
                    for t1 in ("animate", "inanimate"):
                        for t2 in ("neuter", "feminine"):
                            if (
                                t1 in tags
                                and t2 in tags
                                and "masculine" not in tags
                                and "plural" not in tags
                            ):
                                tags.remove(t1)

                # German adjective tables contain "(keiner)" etc
                # for mixed declension plural. When the adjective
                # disappears and it becomes just one word, remove
                # the "includes-article" tag. e.g. eiskalt/German
                if "includes-article" in tags and " " not in form:
                    tags.remove("includes-article")

                # Handle ignored forms. We mark that the form was
                # provided. This is important information; some words
                # just do not have a certain form. However, there also
                # many cases where no word in a language has a
                # particular form. Post-processing could detect and
                # remove such cases.
                if form in IGNORED_COLVALUES:
                    # if cell text seems to be ignorable
                    if "dummy-ignore-skipped" in tags:
                        continue
                    if (
                        col_idx not in has_covering_hdr
                        and some_has_covered_text
                    ):
                        continue
                    # don't ignore this cell if there's been a header
                    # above it
                    form = "-"
                elif col_idx in has_covering_hdr:
                    some_has_covered_text = True

                # Handle ambiguous object concord. If a header
                # gives the "dummy-object-concord"-tag to a word,
                # replace person, number and gender tags with
                # their "object-" counterparts so that the verb
                # agrees with the object instead.
                # Use only when the verb has ONLY object agreement!
                # a پخول/Pashto
                if "dummy-object-concord" in tags:
                    for subtag, objtag in object_concord_replacements.items():
                        if subtag in tags:
                            tags.remove(subtag)
                            tags.add(objtag)

                # Remove the dummy mood tag that we sometimes
                # use to block adding other mood and related
                # tags
                tags = tags - set(
                    [
                        "dummy-mood",
                        "dummy-tense",
                        "dummy-ignore-skipped",
                        "dummy-object-concord",
                        "dummy-reset-headers",
                        "dummy-use-as-coltags",
                        "dummy-use-as-rowtags",
                        "dummy-store-hdrspan",
                        "dummy-load-stored-hdrspans",
                        "dummy-reset-stored-hdrspans",
                        "dummy-section-header",
                    ]
                )

                # Perform language-specific tag replacements according
                # to rules in a table.
                lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
                if lang_tag_mappings is not None:
                    for pre, post in lang_tag_mappings.items():
                        if all(t in tags for t in pre):
                            tags = (tags - set(pre)) | set(post)

                # Warn if there are entries with empty tags
                if not tags:
                    wxr.wtp.debug(
                        "inflection table: empty tags for {}".format(form),
                        sortid="inflection/1826",
                    )

                # Warn if form looks like IPA
                ########## XXX ########
                # Because IPA is its own unicode block, we could also
                # technically do a Unicode name check to see if a string
                # contains IPA. Not all valid IPA characters are in the
                # IPA extension block, so you can technically have false
                # negatives if it's something like /toki/, but it
                # shouldn't give false positives.
                # Alternatively, you could make a list of IPA-admissible
                # characters and reject non-IPA stuff with that.
                if re.match(r"\s*/.*/\s*$", form):
                    wxr.wtp.debug(
                        "inflection table form looks like IPA: "
                        "form={} tags={}".format(form, tags),
                        sortid="inflection/1840",
                    )

                # Note that this checks `form`, not `in tags`
                if form == "dummy-ignored-text-cell":
                    continue

                if "dummy-remove-this-cell" in tags:
                    continue

                # Add the form
                tags = list(sorted(tags))
                dt = {"form": form, "tags": tags, "source": source}
                if roman:
                    dt["roman"] = roman
                if ipa:
                    dt["ipa"] = ipa
                ret.append(dt)
                # If we got separate clitic form, add it
                if clitic:
                    dt = {
                        "form": clitic,
                        "tags": tags + ["clitic"],
                        "source": source,
                    }
                    ret.append(dt)
        return ret, form, some_has_covered_text
2167 # First extract definitions from cells
2168 # See defs_ht for footnote defs stuff
2169 for row in rows:
2170 for cell in row:
2171 text, refs, defs, hdr_tags = extract_cell_content(
2172 lang, word, cell.text
2173 )
2174 # refs, defs = footnote stuff, defs -> (ref, def)
2175 add_defs(defs)
2176 # Extract definitions from text after table
2177 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after)
2178 add_defs(defs)
2180 # Then extract the actual forms
2181 ret = []
2182 hdrspans = []
2183 first_col_has_text = False
2184 rownum = 0
2185 title = None
2186 global_tags = []
2187 table_tags = []
2188 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits")
2189 form_replacements = get_lang_conf(lang, "form_replacements")
2190 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells")
2191 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups")
2193 for title in titles:
2194 more_global_tags, more_table_tags, extra_forms = parse_title(
2195 title, source
2196 )
2197 global_tags.extend(more_global_tags)
2198 table_tags.extend(more_table_tags)
2199 ret.extend(extra_forms)
2200 cell_rowcnt = collections.defaultdict(int)
2201 seen_cells = set()
2202 has_covering_hdr = set()
2203 some_has_covered_text = False
2204 for row in rows:
2205 # print("ROW:", row)
2206 # print("====")
2207 # print(f"Start of PREVIOUS row hdrspans:"
2208 # f"{tuple(sp.tagsets for sp in hdrspans)}")
2209 # print(f"Start of row txt: {tuple(t.text for t in row)}")
2210 if not row: 2210 ↛ 2211line 2210 didn't jump to line 2211 because the condition on line 2210 was never true
2211 continue # Skip empty rows
2212 all_headers = all(x.is_title or not x.text.strip() for x in row)
2213 text = row[0].text
2214 if (
2215 row[0].is_title
2216 and text
2217 and not is_superscript(text[0])
2218 and text not in infl_map # zealous inflation map?
2219 and (
2220 re.match(r"Inflection ", text)
2221 or re.sub(
2222 r"\s+",
2223 " ", # flatten whitespace
2224 re.sub(
2225 r"\s*\([^)]*\)",
2226 "",
2227 # Remove whitespace+parens
2228 text,
2229 ),
2230 ).strip()
2231 not in infl_map
2232 )
2233 and not re.match(infl_start_re, text)
2234 and all(
2235 x.is_title == row[0].is_title and x.text == text
2236 # all InflCells in `row` have the same is_title and text
2237 for x in row
2238 )
2239 ):
2240 if text and title is None:
2241 # Only if there were no titles previously make the first
2242 # text that is found the title
2243 title = text
2244 if re.match(r"(Note:|Notes:)", title): 2244 ↛ 2245line 2244 didn't jump to line 2245 because the condition on line 2244 was never true
2245 continue # not a title
2246 more_global_tags, more_table_tags, extra_forms = parse_title(
2247 title, source
2248 )
2249 global_tags.extend(more_global_tags)
2250 table_tags.extend(more_table_tags)
2251 ret.extend(extra_forms)
2252 continue # Skip title rows without incrementing i
2253 if "dummy-skip-this" in global_tags: 2253 ↛ 2254line 2253 didn't jump to line 2254 because the condition on line 2253 was never true
2254 return []
2255 rowtags = [()]
2256 # have_hdr = False
2257 # have_hdr never used?
2258 have_text = False
2259 samecell_cnt = 0
2260 col0_hdrspan = None # col0 or later header (despite its name)
2261 col0_followed_by_nonempty = False
2262 row_empty = True
2263 for col_idx, cell in enumerate(row):
2264 colspan = cell.colspan # >= 1
2265 rowspan = cell.rowspan # >= 1
2266 previously_seen = id(cell) in seen_cells
2267 # checks to see if this cell was in the previous ROW
2268 seen_cells.add(id(cell))
2269 if samecell_cnt == 0:
2270 # First column of a (possible multi-column) cell
2271 samecell_cnt = colspan - 1
2272 else:
2273 assert samecell_cnt > 0
2274 samecell_cnt -= 1
2275 continue
2277 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0
2278 # never used?
2280 # defaultdict(int) around line 1900
2281 cell_rowcnt[id(cell)] += 1
2282 # => how many cols this spans
2283 col = cell.text
2284 if not col:
2285 continue
2286 row_empty = False
2287 is_title = cell.is_title
2289 # If the cell has a target, i.e., text after colon, interpret
2290 # it as simply specifying a value for that value and ignore
2291 # it otherwise.
2292 if cell.target:
2293 text, refs, defs, hdr_tags = extract_cell_content(
2294 lang, word, col
2295 )
2296 if not text: 2296 ↛ 2297line 2296 didn't jump to line 2297 because the condition on line 2296 was never true
2297 continue
2298 refs_tags = set()
2299 for ref in refs: # gets tags from footnotes 2299 ↛ 2300line 2299 didn't jump to line 2300 because the loop on line 2299 never started
2300 if ref in def_ht:
2301 refs_tags.update(def_ht[ref])
2302 rowtags = expand_header(
2303 wxr,
2304 tablecontext,
2305 word,
2306 lang,
2307 pos,
2308 text,
2309 [],
2310 silent=True,
2311 depth=depth,
2312 )
2313 rowtags = list(
2314 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags)
2315 )
2316 is_title = False
2317 col = cell.target
2319 # print(rownum, col_idx, col)
2320 # print(f"is_title: {is_title}")
2321 if is_title:
2322 # It is a header cell
2323 text, refs, defs, hdr_tags = extract_cell_content(
2324 lang, word, col
2325 )
2326 if not text:
2327 continue
2328 # Extract tags from referenced footnotes
2329 refs_tags = set()
2330 for ref in refs:
2331 if ref in def_ht:
2332 refs_tags.update(def_ht[ref])
2334 # Expand header to tags
2335 v = expand_header(
2336 wxr,
2337 tablecontext,
2338 word,
2339 lang,
2340 pos,
2341 text,
2342 [],
2343 silent=True,
2344 depth=depth,
2345 )
2346 # print("EXPANDED {!r} to {}".format(text, v))
2348 if col_idx == 0:
2349 # first_col_has_text is used for a test to ignore
2350 # upper-left cells that are just text without
2351 # header info
2352 first_col_has_text = True
2353 # Check if the header expands to reset hdrspans
2354 if any("dummy-reset-headers" in tt for tt in v):
2355 new_hdrspans = []
2356 for hdrspan in hdrspans:
2357 # if there are HdrSpan objects (abstract headers with
2358 # row- and column-spans) that are to the left or at the
2359 # same row or below, KEEP those; things above and to
2360 # the right of the hdrspan with dummy-reset-headers
2361 # are discarded. Tags from the header together with
2362 # dummy-reset-headers are kept as normal.
2363 if (
2364 hdrspan.start + hdrspan.colspan < col_idx
2365 or hdrspan.rownum > rownum - cell.rowspan
2366 ):
2367 new_hdrspans.append(hdrspan)
2368 hdrspans = new_hdrspans
2370 for tt in v:
2371 if "dummy-section-header" in tt: 2371 ↛ 2372line 2371 didn't jump to line 2372 because the condition on line 2371 was never true
2372 tablecontext.section_header = tt
2373 break
2374 if "dummy-reset-section-header" in tt: 2374 ↛ 2375line 2374 didn't jump to line 2375 because the condition on line 2374 was never true
2375 tablecontext.section_header = []
2376 # Text between headers on a row causes earlier headers to
2377 # be reset
2378 if have_text:
2379 # print(" HAVE_TEXT BEFORE HDR:", col)
2380 # Reset rowtags if new title column after previous
2381 # text cells
2382 # +-----+-----+-----+-----+
2383 # |hdr-a|txt-a|hdr-B|txt-B|
2384 # +-----+-----+-----+-----+
2385 # ^reset rowtags=>
2386 # XXX beware of header "—": "" - must not clear on that if
2387 # it expands to no tags
2388 rowtags = [()]
2389 # have_hdr = True
2390 # have_hdr never used?
2391 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags))
2392 # Update rowtags and coltags
2393 has_covering_hdr.add(col_idx) # col_idx == current column
2394 # has_covering_hdr is a set that has the col_idx-ids of columns
2395 # that have previously had some kind of header. It is never
2396 # resetted inside the col_idx-loops OR the bigger rows-loop, so
2397 # applies to the whole table.
2399 rowtags, new_coltags, all_hdr_tags = generate_tags(
2400 rowtags, table_tags
2401 )
2403 if any("dummy-skip-this" in ts for ts in rowtags):
2404 continue # Skip this cell
2406 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2406 ↛ 2407line 2406 didn't jump to line 2407 because the condition on line 2406 was never true
2407 hdrspans.extend(tablecontext.stored_hdrspans)
2409 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2409 ↛ 2410line 2409 didn't jump to line 2410 because the condition on line 2409 was never true
2410 tablecontext.stored_hdrspans = []
2412 if any("dummy-store-hdrspan" in ts for ts in v): 2412 ↛ 2414line 2412 didn't jump to line 2414 because the condition on line 2412 was never true
2413 # print(f"STORED: {col}")
2414 store_new_hdrspan = True
2415 else:
2416 store_new_hdrspan = False
2418 new_coltags = list(
2419 x
2420 for x in new_coltags
2421 if not any(t in noinherit_tags for t in x)
2422 )
2423 # print("new_coltags={} previously_seen={} all_hdr_tags={}"
2424 # .format(new_coltags, previously_seen, all_hdr_tags))
2425 if any(new_coltags):
2426 (
2427 col,
2428 col0_followed_by_nonempty,
2429 col0_hdrspan,
2430 ) = add_new_hdrspan(
2431 col,
2432 hdrspans,
2433 store_new_hdrspan,
2434 col0_followed_by_nonempty,
2435 col0_hdrspan,
2436 )
2438 continue
2440 # These values are ignored, at least for now
2441 if re.match(r"^(# |\(see )", col): 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true
2442 continue
2444 if any("dummy-skip-this" in ts for ts in rowtags):
2445 continue # Skip this cell
2447 # If the word has no rowtags and is a multi-row cell, then
2448 # ignore this. This happens with empty separator rows
2449 # within a rowspan>1 cell. cf. wander/English/Conjugation.
2450 if rowtags == [()] and rowspan > 1:
2451 continue
2453 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle.
2454 if cleanup_rules:
2455 for regx, substitution in cleanup_rules.items():
2456 col = re.sub(regx, substitution, col)
2458 if ( 2458 ↛ 2463line 2458 didn't jump to line 2463 because the condition on line 2458 was never true
2459 col_idx == 0
2460 and not first_col_has_text
2461 and get_lang_conf(lang, "ignore_top_left_text_cell") is True
2462 ):
2463 continue # Skip text at top left, as in Icelandic, Faroese
2465 # if col0_hdrspan is not None:
2466 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}"
2467 # .format(col0_hdrspan.text, col))
2468 col0_followed_by_nonempty = True
2469 have_text = True
2471 # Determine column tags for the multi-column cell
2472 combined_coltags = compute_coltags(
2473 lang, pos, hdrspans, col_idx, colspan, col
2474 )
2475 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2475 ↛ 2476line 2475 didn't jump to line 2476 because the condition on line 2475 was never true
2476 continue
2478 # print("HAVE_TEXT:", repr(col))
2479 # Split the text into separate forms. First simplify spaces except
2480 # newline.
2481 col = re.sub(r"[ \t\r]+", " ", col)
2482 # Split the cell text into alternatives
2484 col, alts, split_extra_tags = split_text_into_alts(col)
2486 # Some cells have mixed form content, like text and romanization,
2487 # or text and IPA. Handle these.
2488 alts = handle_mixed_lines(alts)
2490 alts = list((x, combined_coltags) for x in alts)
2492 # Generate forms from the alternatives
2493 # alts is a list of (tuple of forms, tuple of tags)
2494 for (form, base_roman, ipa), coltags in alts:
2495 form = form.strip()
2496 extra_tags = []
2497 extra_tags.extend(split_extra_tags)
2498 # Handle special splits again here, so that we can have custom
2499 # mappings from form to form and tags.
2500 if form in form_replacements:
2501 replacement, tags = form_replacements[form]
2502 for x in tags.split():
2503 assert x in valid_tags
2504 assert isinstance(replacement, str)
2505 assert isinstance(tags, str)
2506 form = replacement
2507 extra_tags.extend(tags.split())
2508 # Clean the value, extracting reference symbols
2509 form, refs, defs, hdr_tags = extract_cell_content(
2510 lang, word, form
2511 )
2512 # if refs:
2513 # print("REFS:", refs)
2514 extra_tags.extend(hdr_tags)
2515 # Extract tags from referenced footnotes
2516 # Extract tags from referenced footnotes
2517 refs_tags = set()
2518 for ref in refs:
2519 if ref in def_ht:
2520 refs_tags.update(def_ht[ref])
2522 if base_roman:
2523 base_roman, _, _, hdr_tags = extract_cell_content(
2524 lang, word, base_roman
2525 )
2526 extra_tags.extend(hdr_tags)
2528 # Do some additional cleanup on the cell.
2529 form = re.sub(r"^\s*,\s*", "", form)
2530 form = re.sub(r"\s*,\s*$", "", form)
2531 form = re.sub(r"\s*(,\s*)+", ", ", form)
2532 form = re.sub(r"(?i)^Main:", "", form)
2533 form = re.sub(r"\s+", " ", form)
2534 form = form.strip()
2536 # Look for parentheses that have semantic meaning
2537 form, et = find_semantic_parens(form)
2538 extra_tags.extend(et)
2540 # Handle parentheses in the table element. We parse
2541 # tags anywhere and romanizations anywhere but beginning.
2542 roman = base_roman
2543 paren = None
2544 clitic = None
2545 m = re.search(r"(\s+|^)\(([^)]*)\)", form)
2546 # start|spaces + (anything)
2547 if m is not None:
2548 subst = m.group(1)
2549 paren = m.group(2)
2550 else:
2551 m = re.search(r"\(([^)]*)\)(\s+|$)", form)
2552 # (anything) + spaces|end
2553 if m is not None: 2553 ↛ 2554line 2553 didn't jump to line 2554 because the condition on line 2553 was never true
2554 paren = m.group(1)
2555 subst = m.group(2)
2556 if paren is not None:
2557 form, roman, clitic = handle_parens(
2558 form, roman, clitic, extra_tags
2559 )
2561 # Ignore certain forms that are not really forms,
2562 # unless they're really, really close to the article title
2563 if form in ( 2563 ↛ 2568line 2563 didn't jump to line 2568 because the condition on line 2563 was never true
2564 "",
2565 "unchanged",
2566 "after an", # in sona/Irish/Adj/Mutation
2567 ):
2568 Lev = distw([form], word)
2569 if form and Lev < 0.1:
2570 wxr.wtp.debug(
2571 "accepted possible false positive '{}' with"
2572 "> 0.1 Levenshtein distance in {}/{}".format(
2573 form, word, lang
2574 ),
2575 sortid="inflection/2213",
2576 )
2577 elif form and Lev < 0.3:
2578 wxr.wtp.debug(
2579 "skipped possible match '{}' with > 0.3"
2580 "Levenshtein distance in {}/{}".format(
2581 form, word, lang
2582 ),
2583 sortid="inflection/2218",
2584 )
2585 continue
2586 else:
2587 continue
2588 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} "
2589 # "FORM={!r} ROMAN={!r}"
2590 # .format(rowtags, coltags, refs_tags,
2591 # form, roman))
2593 # Merge tags from row and column and do miscellaneous
2594 # tag-related handling.
2595 (
2596 merge_ret,
2597 form,
2598 some_has_covered_text,
2599 ) = merge_row_and_column_tags(form, some_has_covered_text)
2600 ret.extend(merge_ret)
2602 # End of row.
2603 rownum += 1
2604 # For certain languages, if the row was empty, reset
2605 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb).
2606 if row_empty and get_lang_conf(lang, "empty_row_resets"):
2607 hdrspans = []
2608 # Check if we should expand col0_hdrspan.
2609 if col0_hdrspan is not None:
2610 col0_allowed = get_lang_conf(lang, "hdr_expand_first")
2611 col0_cats = tagset_cats(col0_hdrspan.tagsets)
2612 # Only expand if col0_cats and later_cats are allowed
2613 # and don't overlap and col0 has tags, and there have
2614 # been no disallowed cells in between.
2615 if (
2616 not col0_followed_by_nonempty
2617 and not (col0_cats - col0_allowed)
2618 and
2619 # len(col0_cats) == 1 and
2620 col_idx > col0_hdrspan.start + col0_hdrspan.colspan
2621 ):
2622 # If an earlier header is only followed by headers that yield
2623 # no tags, expand it to entire row
2624 # print("EXPANDING COL0: {} from {} to {} cols {}"
2625 # .format(col0_hdrspan.text, col0_hdrspan.colspan,
2626 # len(row) - col0_hdrspan.start,
2627 # col0_hdrspan.tagsets))
2628 col0_hdrspan.colspan = len(row) - col0_hdrspan.start
2629 col0_hdrspan.expanded = True
2630 # XXX handle refs and defs
2631 # for x in hdrspans:
2632 # print(" HDRSPAN {} {} {} {!r}"
2633 # .format(x.start, x.colspan, x.tagsets, x.text))
2635 # Post-process German nouns with articles in separate columns. We move the
2636 # definite/indefinite/usually-without-article markers into the noun and
2637 # remove the article entries.
2638 if get_lang_conf(lang, "articles_in_separate_columns") and any(
2639 "noun" in x["tags"] for x in ret
2640 ):
2641 new_ret = []
2642 saved_tags = set()
2643 had_noun = False
2644 for dt in ret:
2645 tags = dt["tags"]
2646 # print(tags)
2647 if "noun" in tags:
2648 tags = list(
2649 sorted(set(t for t in tags if t != "noun") | saved_tags)
2650 )
2651 had_noun = True
2652 elif ( 2652 ↛ 2679line 2652 didn't jump to line 2679 because the condition on line 2652 was always true
2653 "indefinite" in tags
2654 or "definite" in tags
2655 or "usually-without-article" in tags
2656 or "without-article" in tags
2657 ):
2658 if had_noun:
2659 saved_tags = set(tags)
2660 else:
2661 saved_tags = saved_tags | set(tags) # E.g. Haus/German
2662 remove_useless_tags(lang, pos, saved_tags)
2663 saved_tags = saved_tags & set(
2664 [
2665 "masculine",
2666 "feminine",
2667 "neuter",
2668 "singular",
2669 "plural",
2670 "indefinite",
2671 "definite",
2672 "usually-without-article",
2673 "without-article",
2674 ]
2675 )
2676 had_noun = False
2677 continue # Skip the articles
2679 dt = dt.copy()
2680 dt["tags"] = tags
2681 new_ret.append(dt)
2682 ret = new_ret
2684 elif possibly_ignored_forms:
2685 # Some languages have tables with cells that are kind of separated
2686 # and difficult to handle, like eulersche Formel/German where
2687 # the definite and indefinite articles are just floating.
2688 # If a language has a dict of conditionally_ignored_cells,
2689 # and if the contents of a cell is found in one of the rules
2690 # there, ignore that cell if it
2691 # 1. Does not have the appropriate tag (like "definite" for "die")
2692 # and
2693 # 2. The title of the article is not one of the other co-words
2694 # (ie. it's an article for the definite articles in german etc.)
2695 # pass
2696 new_ret = []
2697 for cell_data in ret:
2698 tags = cell_data["tags"]
2699 text = cell_data["form"]
2700 skip_this = False
2701 for key_tag, ignored_forms in possibly_ignored_forms.items():
2702 if text not in ignored_forms: 2702 ↛ 2704line 2702 didn't jump to line 2704 because the condition on line 2702 was always true
2703 continue
2704 if word in ignored_forms:
2705 continue
2706 if key_tag not in tags:
2707 skip_this = True
2709 if skip_this: 2709 ↛ 2710line 2709 didn't jump to line 2710 because the condition on line 2709 was never true
2710 continue
2711 new_ret.append(cell_data)
2713 ret = new_ret
2715 # Post-process English inflection tables, addding "multiword-construction"
2716 # when the number of words has increased.
2717 if lang == "English" and pos == "verb":
2718 word_words = len(word.split())
2719 new_ret = []
2720 for dt in ret:
2721 form = dt.get("form", "")
2722 if len(form.split()) > word_words:
2723 dt = dt.copy()
2724 dt["tags"] = list(dt.get("tags", []))
2725 # This strange copy-assigning shuffle is preventative black
2726 # magic; do not touch lest you invoke deep bugs.
2727 data_append(dt, "tags", "multiword-construction")
2728 new_ret.append(dt)
2729 ret = new_ret
2731 # Always insert "table-tags" detail as the first entry in any inflection
2732 # table. This way we can reliably detect where a new table starts.
2733 # Table-tags applies until the next table-tags entry.
2734 if ret or table_tags:
2735 table_tags = list(sorted(set(table_tags)))
2736 dt = {
2737 "form": " ".join(table_tags),
2738 "source": source,
2739 "tags": ["table-tags"],
2740 }
2741 if dt["form"] == "":
2742 dt["form"] = "no-table-tags"
2743 if tablecontext.template_name:
2744 tn = {
2745 "form": tablecontext.template_name,
2746 "source": source,
2747 "tags": ["inflection-template"],
2748 }
2749 ret = [dt] + [tn] + ret
2750 else:
2751 ret = [dt] + ret
2753 return ret
def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Parse ``rows``/``titles`` (a grid of InflCell objects) into word
    forms and append the de-duplicated results to ``data["forms"]``."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        assert isinstance(row, list)
        for cell in row:
            assert isinstance(cell, InflCell)
    assert isinstance(titles, list)
    for title in titles:
        assert isinstance(title, str)

    # Attempt the simple-table parser; it is currently the only strategy.
    parsed = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if parsed is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Append the returned forms, skipping exact duplicates.  Some Russian
    # words have "Declension" and "Pre-reform declension" tables that
    # partially duplicate the same data with a "dated" tag; drop such a
    # variant when the same form without "dated" is already present.
    seen = set()
    for entry in parsed:
        frozen = freeze(entry)
        if frozen in seen:
            continue  # Don't add duplicates
        tags = entry.get("tags", [])
        if "dated" in tags:
            undated = [t for t in tags if t != "dated"]
            twin = entry.copy()
            twin["tags"] = undated
            if undated and freeze(twin) in seen:
                continue  # Already have the same form without "dated"
        if "table-tags" not in tags:
            seen.add(frozen)
        data_append(data, "forms", entry)
def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Decide whether a table cell should be treated as a header.

    Applies a cascade of heuristics: expanding the cleaned cell text
    through expand_header, checking the cell's node kind against the
    table's header kind, comparing cell style against the row's first
    column, and per-language whitelists (LANGUAGES_WITH_CELLS_AS_HEADERS).

    Returns a tuple ``(is_title, hdr_expansion, target, celltext)``:
    ``is_title`` — True if the cell is a header; ``hdr_expansion`` — the
    tag-set expansion of the cleaned text; ``target`` — the part after a
    "<header>: <target>" split when the prefix is a known header, else the
    ``target`` argument passed in; ``celltext`` — possibly truncated to
    the header prefix.
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # Which node kind marks a header cell depends on whether this is a
    # parsed wikitext table or a raw HTML table.
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    # "<header>: <target>" cells split a header prefix from a target word.
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    # NOTE(review): def_re/nondef_re are module-level patterns defined
    # outside this excerpt.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #      "lang={} pos={}"
    #      .format(titletext, hdr_expansion, candidate_hdr,
    #              lang, pos))
    if idx >= 0 and titletext[:idx] in infl_map:
        # "<known header>: <target>" — keep the header part, return target.
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext
class TableContext:
    """Saved context used when parsing a table and its subtables."""

    # Fix: the original spelled this ``__slot__`` (singular), which Python
    # treats as an ordinary, meaningless class attribute — instances still
    # got a ``__dict__`` and no memory was saved.  ``__slots__`` is the
    # attribute that actually activates the slots machinery.  Only the
    # three attributes below are ever used by this module.
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name: Optional[str] = None) -> None:
        # Header spans carried over between subtables of the same table.
        self.stored_hdrspans = []
        # Section-level headers shared with subtables.
        self.section_header = []
        # Name of the inflection template that produced the table, or "".
        self.template_name = template_name or ""
def handle_wikitext_or_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms. This adds the forms into ``data``.

    ``tree`` must be either a wikitext TABLE node or an HTML <table> node.
    ``titles`` are title strings collected from surrounding context (e.g.
    NavFrame headers); ``source`` identifies the section the table came
    from; ``after`` is trailing text following the table.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    if not tablecontext:
        tablecontext = TableContext()

    def handle_table1(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        depth,
    ):
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order.

        Returns a list of (rows, titles, after, depth) tuples, one per
        (sub)table, in document order.
        """

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        col_gap_data = []  # Filling for columns with rowspan > 1
        # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
        # the column; vertical_still_left contains int
        cols_headered = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows = []  # Completed rows of InflCell for the current (sub)table

        sub_ret = []  # Accumulates (rows, titles, after, depth) of subtables

        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            # Normalize node kind: HTML nodes use their tag name (str),
            # wikitext nodes use the NodeKind enum.
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are intially shown in tables
                    # that have more data. The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # Parse a table row.
                row = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in node.children:
                    # loop through each cell in the ROW
                    if not isinstance(col, WikiNode):
                        # This skip is not used for counting,
                        # "None" is not used in
                        # indexing or counting or looping.
                        continue
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print("    UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column. It is populated at the end of the loop, at the
                        # same time as col_gap_data. This needs to be looped and
                        # filled this way because each `for col`-looping jumps
                        # straight to the next meaningful cell; there is no
                        # "None" cells, only emptiness between, and rowspan and
                        # colspan are just to generate the "fill-
                        vertical_still_left[len(row)] -= 1
                        row.append(col_gap_data[len(row)])

                        # appending row is how "indexing" is
                        # done here; something is appended,
                        # like a filler-cell here or a "start"
                        # cell at the end of the row-loop,
                        # which increased len(row) which is
                        # then used as the target-index to check
                        # for gaps. vertical_still_left is
                        # the countdown to when to stop
                        # filling in gaps, and goes down to 0,
                        # and col_gap_data is not touched
                        # except when a new rowspan is needed,
                        # at the same time that
                        # vertical_still_left gets reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        # Malformed span attributes; fall back to 1x1 cell.
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: isinstance(x, WikiNode)
                        and (x.kind == NodeKind.TABLE or x.sarg == "table"),
                    )

                    # Clean the rest of the cell.
                    celltext = clean_node(wxr, None, rest)
                    # print("CLEANED:", celltext)

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a descrip-
                        # tive text that should be treated as a title (e.g.,
                        # "Forms with the definite article", with "definite" not
                        # mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Flush the rows collected so far as their own
                            # piece so document order is preserved, then
                            # splice in the subtable's pieces.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    # Strip trailing superscript characters (footnote marks).
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #    row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/ed if
                                # it is lacking slots
                                # for each column; col_gap_data and
                                # vertical_still_left are never
                                # reset to [], during
                                # the whole table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that
                            # will be used in the
                            # future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to be
                            # filled (row.append or
                            # row[col_gap_data[len(row)] =>
                            # rows), it is not reset to [], but decremented to 0
                            # each time a row gets something from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects. Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )
def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """A passer-on function for html-tables, XXX, remove these?"""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )
def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """A passer-on function for wikitext tables, XXX, remove these?"""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )
def parse_inflection_section(
    wxr, data, word, lang, pos, section, tree, tablecontext=None
):
    """Parses an inflection section on a page. ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    Walks ``tree`` collecting tables (wikitext and HTML) together with
    surrounding title text (NavFrame headers, preceding bolded titles),
    then dispatches each table to the appropriate handler.
    """

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    #       .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Each entry: [kind, table-node, titles, after-text-parts]
    tables = []
    # String fragments accumulated inside a NavFrame's NavHead (its title).
    titleparts = []
    preceding_bolded_title = ""

    def process_tables():
        # Dispatch every collected table to its handler; ``after`` text
        # fragments are joined and cleaned first.
        for kind, node, titles, after in tables:
            after = "".join(after).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node, titles):
        # Handle a NavFrame div: collect its NavHead text as a title and
        # process any tables inside it immediately, isolated from tables
        # collected elsewhere.
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(node, titles, navframe=False):
        # Generic tree walk; ``navframe`` is True while inside a NavFrame.
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Strings after a collected table become its "after" text;
            # inside a NavFrame (before a table) they are title parts.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    # The NavHead text collected so far becomes a title for
                    # the tables inside this NavContent.
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                classes = node.attrs.get("class", ())
                if "audiotable" in classes:
                    # Pronunciation tables are not inflection tables.
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
        if (
            kind == NodeKind.HTML
            and node.sarg == "div"
            and "NavFrame" in node.attrs.get("class", "").split()
        ):
            recurse_navframe(node, titles)
            return
        if kind == NodeKind.LINK:
            # Recurse into the link's display text if present, else target.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A ";"-list item is a bolded title that applies to following
            # siblings (see the main loop below).
            nonlocal preceding_bolded_title
            from wiktextract.page import clean_node
            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")