Coverage for src/wiktextract/extractor/en/inflection.py: 87%
1518 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1# Code for parsing inflection tables.
2#
3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org.
5import collections
6import copy
7import functools
8import html
9import itertools
10import re
11import unicodedata
12from typing import Generator, Optional, Union
14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode
16from ...clean import clean_value
17from ...datautils import data_append, freeze, split_at_comma_semi
18from ...tags import valid_tags
19from ...wxr_context import WiktextractContext
20from .form_descriptions import (
21 classify_desc,
22 decode_tags,
23 distw,
24 parse_head_final_tags,
25)
26from .inflectiondata import infl_map, infl_start_map, infl_start_re
27from .lang_specific_configs import get_lang_conf, lang_specific_tags
28from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS
29from .type_utils import FormData
# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
# None disables the debug output; set via set_debug_cell_text() below.
debug_cell_text: Optional[str] = None
def set_debug_cell_text(text: str) -> None:
    """Set the cell text that triggers debug printouts while parsing
    inflection tables (see the --debug-text-cell command-line option)."""
    global debug_cell_text
    debug_cell_text = text
# A list of alternative tag tuples; each tuple is one possible tag
# combination for a header or cell.
TagSets = list[tuple[str, ...]]

# Column texts that are interpreted as an empty column.
# Most entries are Unicode dash/hyphen variants (Hebrew maqaf, figure
# dashes, em/en dashes, minus sign, fullwidth hyphen, etc.).
IGNORED_COLVALUES = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "-",
    "/",
    "?",
    "not used",
    "not applicable",
}
# These tags are never inherited from above
# (header tags normally propagate down to the cells they cover; these
# infinitive-class tags apply only to their own header).
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}
# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags
object_concord_replacements = {
    "first-person": "object-first-person",
    "second-person": "object-second-person",
    "third-person": "object-third-person",
    "singular": "object-singular",
    "plural": "object-plural",
    "definite": "object-definite",
    "indefinite": "object-indefinite",
    # class-1 .. class-18 follow a mechanical pattern, so they are
    # generated in place rather than written out by hand.
    **{f"class-{n}": f"object-class-{n}" for n in range(1, 19)},
    "masculine": "object-masculine",
    "feminine": "object-feminine",
}
# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: every value must be a space-separated list
# of tags known to valid_tags.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# Matches table captions such as "Declension of X"; such matches are
# ignored when extracting tags from the title.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"
table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)
# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Regional tags in table headers (e.g. Corsican, Lombard)
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
}
# Import-time sanity check: every value must be a space-separated list
# of tags known to valid_tags.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)
# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "Attic": "Attic",  # e.g. καλός/Greek/Adj
    "Epic": "Epic",  # e.g. καλός/Greek/Adj
}
# Import-time sanity check: every value must be a space-separated list
# of tags known to valid_tags.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Parenthesized element starts that map to a tag; the tag then applies to
# the rest of the element (which is recorded as the form)
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: every value must be a space-separated list
# of tags known to valid_tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)
# Regexp for cell starts that are likely definitions of reference symbols
# (e.g. "* rare form", "¹ archaic").  Group 3 matches a run of symbols,
# group 5 a ^-prefixed symbol, group 6 a single superscript digit.
# See also nondef_re.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"
# Certain tags are moved from headers in tables into word tags, as they always
# apply to the whole word.  This was originally created for an issue with
# number paradigms in Arabic, but that is being handled elsewhere now, so
# the set is currently empty.
TAGS_FORCED_WORDTAGS: set[str] = set()
class InflCell:
    """Cell in an inflection table.

    Holds the cleaned text of one table cell together with its span
    information.  ``is_title`` records whether the cell acts as a header;
    ``target`` is an optional link target extracted from the cell."""

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        self.text = text.strip()
        # An empty cell can never be a header.  BUGFIX: the old expression
        # ``text and is_title`` evaluated to the empty string (not False)
        # for empty cells; coerce to a real bool so the attribute is always
        # True/False as the assert above promises.
        self.is_title = bool(text) and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        v = "{}/{}/{}/{!r}".format(
            self.text, self.is_title, self.colspan, self.rowspan
        )
        if self.target:
            v += ": {!r}".format(self.target)
        return v

    def __repr__(self) -> str:
        return str(self)
class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table.

    ``start``/``colspan``/``rowspan`` describe which columns/rows the
    header covers, ``rownum`` is the row where it occurred, and
    ``tagsets`` holds the alternative tag tuples the header maps to."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: list[tuple[str, ...]],
        text: str,
        all_headers_row: bool,
    ) -> None:
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        # FIX (consistency): rowspan was stored but never validated,
        # unlike colspan; validate it the same way.
        assert isinstance(rowspan, int) and rowspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        for x in tagsets:
            assert isinstance(x, tuple)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Normalize each alternative: deduplicate and sort its tags.
        self.tagsets = list(tuple(sorted(set(tags))) for tags in tagsets)
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False
def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        codepoint_name = unicodedata.name(ch)
    except ValueError:
        # Characters without a Unicode name cannot be superscripts.
        return False
    # Superscript-like characters are recognized by the prefix of their
    # official Unicode name.
    return codepoint_name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )
def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no
    purpose together (cover all options).  Mutates ``tags`` in place."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Drop complementary pairs that cancel out when both are present,
    # if the language configuration says so.
    for first, second, conf_key in (
        ("animate", "inanimate", "animate_inanimate_remove"),
        ("virile", "nonvirile", "virile_nonvirile_remove"),
    ):
        if first in tags and second in tags and get_lang_conf(lang, conf_key):
            tags.remove(first)
            tags.remove(second)
    # When every value of a whole category is present (all numbers, all
    # genders, ...), the category conveys no information — drop them all.
    for conf_key in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        values = get_lang_conf(lang, conf_key)
        if values and all(v in tags for v in values):
            for v in values:
                tags.remove(v)
def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    # Flatten every alternative and look up each tag's category.
    return {valid_tags[tag] for alternative in tagset for tag in alternative}
def or_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed to be lists of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # BUGFIX: this assert previously re-checked tagsets1 (copy-paste error),
    # leaving tagsets2 unvalidated.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: list[tuple[str, ...]] = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Merge one alternative into ``tagsets``, combining it with an
        # existing alternative when they differ in at most one category.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        # Keep the invariant that the result always has at least one
        # (possibly empty) alternative.
        tagsets.append(())
    return tagsets
def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # BUGFIX: this assert previously re-checked tagsets1 (copy-paste error),
    # leaving tagsets2 unvalidated.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            # The placeholder produced for ignored text cells must not leak
            # into the combined result.
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    return new_tagsets
@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags): ``cleaned`` is the cell text with
    reference markers removed, ``refs`` is a list of reference symbols
    attached to the cell, ``defs`` is a list of (symbol, definition)
    pairs when the cell itself defines reference symbols, and ``tags``
    is a list of tags implied by special reference markers.  Results
    are memoized (lru_cache) because the same header texts recur
    across many tables."""
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags = []
    # Normalize: strip trailing comma/bullet and collapse whitespace.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Free-text explanatory cells are ignored entirely.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while True:
        # Peel off trailing ^x or ^(x,y) reference markers one at a time.
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # The cell defines reference symbols: split it into
        # (symbol, definition-text) pairs.
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            # Groups 3/5/6 of def_re capture the symbol variants.
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        # e.g. "1) some note" — a single-digit symbol definition.
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    #       .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags
@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is a list of dictionaries describing additional
    forms to be included in the part-of-speech entry.  Memoized because the
    same titles recur across many pages."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Strip HTML entities/tags and collapse whitespace in the title.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags = []
    table_tags = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip matches that are just the table caption ("Declension of X").
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                # "class X", "type Y" etc.: store the remainder as a form
                # tagged with the mapped tag(s).
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contain no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms
769def expand_header(
770 wxr: WiktextractContext,
771 tablecontext: "TableContext",
772 word: str,
773 lang: str,
774 pos: str,
775 text: str,
776 base_tags: Union[list[str], set[str], tuple[str, ...]],
777 silent=False,
778 ignore_tags=False,
779 depth=0,
780) -> list[tuple[str, ...]]:
781 """Expands a cell header to tagset, handling conditional expressions
782 in infl_map. This returns list of tuples of tags, each list element
783 describing an alternative interpretation. ``base_tags`` is combined
784 column and row tags for the cell in which the text is being interpreted
785 (conditional expressions in inflection data may depend on it).
786 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags``
787 is True, then tags listed in "if" will be ignored in the test (this is
788 used when trying to heuristically detect whether a non-<th> cell is anyway
789 a header)."""
790 assert isinstance(wxr, WiktextractContext)
791 assert isinstance(word, str)
792 assert isinstance(lang, str)
793 assert isinstance(pos, str)
794 assert isinstance(text, str)
795 assert isinstance(base_tags, (list, tuple, set))
796 assert silent in (True, False)
797 assert isinstance(depth, int)
798 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags))
799 # First map the text using the inflection map
800 text = clean_value(wxr, text)
801 combined_return: list[tuple[str, ...]] = []
802 parts = split_at_comma_semi(text, separators=[";"])
803 for text in parts:
804 if not text: 804 ↛ 805line 804 didn't jump to line 805 because the condition on line 804 was never true
805 continue
806 if text in infl_map:
807 v = infl_map[text] # list or string
808 else:
809 m = re.match(infl_start_re, text)
810 if m is not None: 810 ↛ 811line 810 didn't jump to line 811 because the condition on line 810 was never true
811 v = infl_start_map[m.group(1)]
812 # print("INFL_START {} -> {}".format(text, v))
813 elif re.match(r"Notes", text):
814 # Ignored header
815 # print("IGNORING NOTES")
816 combined_return = or_tagsets(
817 lang, pos, combined_return, [("dummy-skip-this",)]
818 )
819 # this just adds dummy-skip-this
820 continue
821 elif text in IGNORED_COLVALUES:
822 combined_return = or_tagsets(
823 lang, pos, combined_return, [("dummy-ignore-skipped",)]
824 )
825 continue
826 # Try without final parenthesized part
827 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text)
828 if text_without_parens in infl_map:
829 v = infl_map[text_without_parens]
830 elif m is None: 830 ↛ 846line 830 didn't jump to line 846 because the condition on line 830 was always true
831 if not silent:
832 wxr.wtp.debug(
833 "inflection table: unrecognized header: {}".format(
834 repr(text)
835 ),
836 sortid="inflection/735",
837 )
838 # Unrecognized header
839 combined_return = or_tagsets(
840 lang, pos, combined_return, [("error-unrecognized-form",)]
841 )
842 continue
844 # Then loop interpreting the value, until the value is a simple string.
845 # This may evaluate nested conditional expressions.
846 default_then = None
847 while True:
848 # If it is a string, we are done.
849 if isinstance(v, str):
850 tags = set(v.split())
851 remove_useless_tags(lang, pos, tags)
852 tagset = [tuple(sorted(tags))]
853 break
854 # For a list, just interpret it as alternatives. (Currently the
855 # alternatives must directly be strings.)
856 if isinstance(v, (list, tuple)):
857 tagset = []
858 for x in v:
859 tags = set(x.split())
860 remove_useless_tags(lang, pos, tags)
861 tags_t = tuple(sorted(tags))
862 if tags_t not in tagset: 862 ↛ 858line 862 didn't jump to line 858 because the condition on line 862 was always true
863 tagset.append(tags_t)
864 break
865 # Otherwise the value should be a dictionary describing a
866 # conditional expression.
867 if not isinstance(v, dict): 867 ↛ 868line 867 didn't jump to line 868 because the condition on line 867 was never true
868 wxr.wtp.debug(
869 "inflection table: internal: "
870 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]),
871 sortid="inflection/767",
872 )
873 tagset = [()]
874 break
875 # Evaluate the conditional expression.
876 assert isinstance(v, dict)
877 cond: Union[bool, str] = "default-true"
878 c: Union[str, list[str], set[str]] = ""
879 # Handle "lang" condition. The value must be either a
880 # single language or a list of languages, and the
881 # condition evaluates to True if the table is one of
882 # those languages.
883 if "lang" in v:
884 c = v["lang"]
885 if isinstance(c, str):
886 cond = c == lang
887 else:
888 assert isinstance(c, (list, tuple, set))
889 cond = lang in c
890 # Handle "nested-table-depth" condition. The value must
891 # be an int or list of ints, and the condition evaluates
892 # True if the depth is one of those values.
893 # "depth" is how deep into a nested table tree the current
894 # table lies. It is first started in handle_wikitext_table,
895 # so only applies to tables-within-tables, not other
896 # WikiNode content. `depth` is currently only passed as a
897 # parameter down the table parsing stack, and not stored.
898 if cond and "nested-table-depth" in v: 898 ↛ 899line 898 didn't jump to line 899 because the condition on line 898 was never true
899 d = v["nested-table-depth"]
900 if isinstance(d, int):
901 cond = d == depth
902 else:
903 assert isinstance(d, (list, tuple, set))
904 cond = depth in d
905 # Handle inflection-template condition. Must be a string
906 # or list of strings, and if tablecontext.template_name is in
907 # those, accept the condition.
908 # TableContext.template_name is passed down from page/
909 # parse_inflection, before parsing and expanding itself
910 # has begun.
911 if cond and tablecontext and "inflection-template" in v:
912 d1 = v["inflection-template"]
913 if isinstance(d1, str): 913 ↛ 916line 913 didn't jump to line 916 because the condition on line 913 was always true
914 cond = d1 == tablecontext.template_name
915 else:
916 assert isinstance(d1, (list, tuple, set))
917 cond = tablecontext.template_name in d1
918 # Handle "pos" condition. The value must be either a single
919 # part-of-speech or a list of them, and the condition evaluates to
920 # True if the part-of-speech is any of those listed.
921 if cond and "pos" in v:
922 c = v["pos"]
923 if isinstance(c, str):
924 cond = c == pos
925 else:
926 assert isinstance(c, (list, tuple, set))
927 cond = pos in c
928 # Handle "if" condition. The value must be a string containing a
929 # space-separated list of tags. The condition evaluates to True if
930 # ``base_tags`` contains all of the listed tags. If the condition
931 # is of the form "any: ...tags...", then any of the tags will be
932 # enough.
933 if cond and "if" in v and not ignore_tags:
934 c = v["if"]
935 assert isinstance(c, str)
936 # "if" condition is true if any of the listed tags is present if
937 # it starts with "any:", otherwise all must be present
938 if c.startswith("any: "):
939 cond = any(t in base_tags for t in c[5:].split())
940 else:
941 cond = all(t in base_tags for t in c.split())
943 # Handle "default" assignment. Store the value to be used
944 # as a default later.
945 if "default" in v:
946 assert isinstance(v["default"], str)
947 default_then = v["default"]
949 # Warning message about missing conditions for debugging.
951 if cond == "default-true" and not default_then and not silent:
952 wxr.wtp.debug(
953 "inflection table: IF MISSING COND: word={} "
954 "lang={} text={} base_tags={} c={} cond={}".format(
955 word, lang, text, base_tags, c, cond
956 ),
957 sortid="inflection/851",
958 )
959 # Based on the result of evaluating the condition, select either
960 # "then" part or "else" part.
961 if cond:
962 v = v.get("then", "")
963 else:
964 v1 = v.get("else")
965 if v1 is None:
966 if default_then:
967 v = default_then
968 else:
969 if not silent:
970 wxr.wtp.debug(
971 "inflection table: IF WITHOUT ELSE EVALS "
972 "False: "
973 "{}/{} {!r} base_tags={}".format(
974 word, lang, text, base_tags
975 ),
976 sortid="inflection/865",
977 )
978 v = "error-unrecognized-form"
979 else:
980 v = v1
982 # Merge the resulting tagset from this header part with the other
983 # tagsets from the whole header
984 combined_return = or_tagsets(lang, pos, combined_return, tagset)
986 # Return the combined tagsets, or empty tagset if we got no tagsets
987 if not combined_return:
988 combined_return = [()]
989 return combined_return
def compute_coltags(
    lang: str,
    pos: str,
    # NOTE(review): was annotated list[str], but every use below accesses
    # .rownum/.start/.colspan/.tagsets/.expanded — these are HdrSpan objects.
    hdrspans: list["HdrSpan"],
    start: int,
    colspan: int,
    # NOTE(review): was annotated int, but asserted to be str below; it is the
    # cell's text, used only for debug printing.
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Walks ``hdrspans`` in reverse (headers closest to the cell first),
    keeping only spans that horizontally overlap the cell at ``start`` with
    width ``colspan``, and merges their tagsets row by row (``and_tagsets``
    across rows, ``or_tagsets`` within a row) into the returned list of
    alternative tag tuples.  Numerous language-specific heuristics
    (via ``get_lang_conf``) decide when to stop or skip merging.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    # .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                " row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    used = set()  # (start, colspan) keys of header spans already consumed
    coltags = [()]
    last_header_row = 1000000
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000
    used_hdrspans = set()  # id()s of HdrSpan objects already merged
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets. This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row. Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside
                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above. However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 10000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately precedes the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags
def parse_simple_table(
    wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
):
    """This is the default table parser. Despite its name, it can parse
    complex tables. This returns a list of forms to be added to the
    part-of-speech, or None if the table could not be parsed.

    ``rows`` is a list of rows, each a list of ``InflCell`` objects;
    ``titles`` is a list of table title strings; ``depth`` is how deep
    into a nested table tree the current table lies.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(tablecontext, TableContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        for col in row:
            assert isinstance(col, InflCell)
    assert isinstance(titles, list)
    for x in titles:
        assert isinstance(x, str)

    # print("PARSE_SIMPLE_TABLE: TITLES:", titles)
    if debug_cell_text:
        print("ROWS:")
        for row in rows:
            print("  ", row)

    # Check for forced rowspan kludge. See e.g.
    # maorski/Serbo-Croatian. These are essentially multi-row
    # cells implemented using <br> rather than separate cell. We fix this
    # by identifying rows where this happens, and splitting the current row
    # to multiple rows by synthesizing additional cells.
    new_rows = []
    for row in rows:
        split_row = (
            any(x.is_title and x.text in ("inanimate\nanimate",) for x in row)
            and
            # x is an InflCell
            all(x.rowspan == 1 for x in row)
        )
        if not split_row:
            new_rows.append(row)
            continue
        # Synthesize two rows out of this one: two-line cells are split,
        # single-line cells are shared between the rows with rowspan=2.
        row1 = []
        row2 = []
        for cell in row:
            cell1 = copy.deepcopy(cell)
            if "\n" in cell.text:
                # Has more than one line - split this cell
                parts = cell.text.strip().splitlines()
                if len(parts) != 2:
                    wxr.wtp.debug(
                        "forced rowspan kludge got {} parts: {!r}".format(
                            len(parts), cell.text
                        ),
                        sortid="inflection/1234",
                    )
                cell2 = copy.deepcopy(cell)
                cell1.text = parts[0]
                cell2.text = parts[1]
            else:
                cell1.rowspan = 2
                cell2 = cell1  # ref, not a copy
            row1.append(cell1)
            row2.append(cell2)
        new_rows.append(row1)
        new_rows.append(row2)
    rows = new_rows
    # print("ROWS AFTER FORCED ROWSPAN KLUDGE:")
    # for row in rows:
    #     print("  ", row)

    # Parse definitions for references (from table itself and from text
    # after it)
    # def_ht maps footnote reference symbol -> tuple of tags
    def_ht = {}
1451 def add_defs(defs: list[tuple[str, str]]) -> None:
1452 for ref, d in defs:
1453 # print("DEF: ref={} d={}".format(ref, d))
1454 d = d.strip()
1455 d = d.split(". ")[0].strip() # text before ". "
1456 if not d: 1456 ↛ 1457line 1456 didn't jump to line 1457 because the condition on line 1456 was never true
1457 continue
1458 if d.endswith("."): # catc ".."??
1459 d = d[:-1]
1460 tags, topics = decode_tags(d, no_unknown_starts=True)
1461 # print(f"{ref=}, {d=}, {tags=}")
1462 if topics or any("error-unknown-tag" in ts for ts in tags):
1463 d = d[0].lower() + d[1:]
1464 tags, topics = decode_tags(d, no_unknown_starts=True)
1465 if topics or any("error-unknown-tag" in ts for ts in tags):
1466 # Failed to parse as tags
1467 # print("Failed: topics={} tags={}"
1468 # .format(topics, tags))
1469 continue
1470 tags1_s: set[str] = set()
1471 for ts in tags:
1472 tags1_s.update(ts)
1473 tags1 = tuple(sorted(tags1_s))
1474 # print("DEFINED: {} -> {}".format(ref, tags1))
1475 def_ht[ref] = tags1
    def generate_tags(
        rowtags: list[tuple[str, ...]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        """Combine row tags, column tags and header-cell tags for the
        current cell.

        Returns ``(new_rowtags, new_coltags, all_hdr_tags)``.  Also mutates
        ``table_tags`` in place when a tag in TAGS_FORCED_WORDTAGS is seen.
        Uses closure variables ``hdrspans``, ``col_idx``, ``colspan``,
        ``col``, ``text``, ``global_tags``, ``refs_tags`` and ``hdr_tags``
        from the enclosing parse.
        """
        new_coltags = []
        all_hdr_tags = []  # list of tuples
        new_rowtags = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                base_tags: set[str] = (
                    set(rt0)
                    | set(ct0)
                    | set(global_tags)
                    | set(itertools.chain.from_iterable(table_tags))
                )  # Union.
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Certain tags are always moved to word-level tags
                    if tt_s & TAGS_FORCED_WORDTAGS:
                        table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS)
                        tt_s = tt_s - TAGS_FORCED_WORDTAGS
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags
    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        """Create a HdrSpan for the current header cell, append it to
        ``hdrspans``, and update the left-side-header (col0) expansion
        state.

        Returns the possibly-updated ``(col, col0_followed_by_nonempty,
        col0_hdrspan)``.  Uses closure variables ``col_idx``, ``colspan``,
        ``rowspan``, ``rownum``, ``new_coltags``, ``all_headers``,
        ``all_hdr_tags`` and ``previously_seen`` from the enclosing parse.
        """
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags). In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            # No left-side header yet; this one becomes it.
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # print("col0_cats={} later_cats={} "
            #       "fol_by_nonempty={} col_idx={} end={} "
            #       "tagsets={}"
            #       .format(col0_cats, later_cats,
            #               col0_followed_by_nonempty, col_idx,
            #               col0_hdrspan.start +
            #               col0_hdrspan.colspan,
            #               col0_hdrspan.tagsets))
            # print("col0.rowspan={} rowspan={}"
            #       .format(col0_hdrspan.rowspan, rowspan))
            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            #   - col0_hdrspan set, continue with allowed current
            #   - col0_hdrspan set, expand, start new
            #   - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan. Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan
                # *Expand* current col0_hdrspan
                # print("EXPANDING COL0 MID: {} from {} to {} "
                #       "cols {}"
                #       .format(col0_hdrspan.text,
                #               col0_hdrspan.colspan,
                #               col_idx - col0_hdrspan.start,
                #               col0_hdrspan.tagsets))
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan
    def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
        """Split a data cell's text into alternative forms.

        Returns ``(col, alts, split_extra_tags)`` where ``alts`` is the
        list of alternative form strings and ``split_extra_tags`` is a
        list of extra tags from a language-specific special split, if any.
        """
        # Split the cell text into alternatives
        split_extra_tags = []
        if col and is_superscript(col[0]):
            # A cell starting with a superscript is kept whole.
            alts = [col]
        else:
            separators = [";", "•", r"\n", " or "]
            if " + " not in col:
                separators.append(",")
            if not col.endswith("/"):
                separators.append("/")
            if col in special_phrase_splits:
                # Use language-specific special splits.
                # These are phrases and constructions that have
                # unique ways of splitting, not specific characters
                # to split on like with the default splitting.
                alts, tags = special_phrase_splits[col]
                split_extra_tags = tags.split()
                for x in split_extra_tags:
                    assert x in valid_tags
                assert isinstance(alts, (list, tuple))
                assert isinstance(tags, str)
            else:
                # Use default splitting. However, recognize
                # language-specific replacements and change them to magic
                # characters before splitting. This way we won't split
                # them. This is important for, e.g., recognizing
                # alternative pronouns.
                # The magic characters are characters out of Unicode scope
                # that are given a simple incremental value, int > unicode.
                repls = {}
                magic_ch = MAGIC_FIRST
                trs = get_lang_conf(lang, "form_transformations")
                # trs is a list of lists of strings
                for _, v, _, _ in trs:
                    # v is a pattern string, like "^ich"
                    # form_transformations data is doing double-duty here,
                    # because the pattern strings are already known to us and
                    # not meant to be split.
                    m = re.search(v, col)
                    if m is not None:
                        # if pattern found in text
                        magic = chr(magic_ch)
                        magic_ch += 1  # next magic character value
                        col = re.sub(v, magic, col)  # replace with magic ch
                        repls[magic] = m.group(0)
                        # remember what regex match string each magic char
                        # replaces. .group(0) is the whole match.
                alts0 = split_at_comma_semi(col, separators=separators)
                # with magic characters in place, split the text so that
                # pre-transformation text is out of the way.
                alts = []
                for alt in alts0:
                    # create a new list with the separated items and
                    # the magic characters replaced with the original texts.
                    for k, v in repls.items():
                        alt = re.sub(k, v, alt)
                    alts.append(alt)

        # Remove "*" from beginning of forms, as in non-attested
        # or reconstructed forms. Otherwise it might confuse romanization
        # detection.
        alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts)
        alts = list(
            x for x in alts if not re.match(r"pronounced with |\(with ", x)
        )
        alts = list(
            re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts
        )
        # Check for parenthesized alternatives, e.g. ripromettersi/Italian
        if all(
            re.match(r"\w+( \w+)* \(\w+( \w+)*(, \w+( \w+)*)*\)$", alt)
            # word word* \(word word*(, word word*)*\)
            and all(
                distw([re.sub(r" \(.*", "", alt)], x) < 0.5
                # Levenshtein distance
                for x in re.sub(r".*\((.*)\)", r"\1", alt).split(", ")
            )
            # Extract from parentheses for testing
            for alt in alts
        ):
            new_alts = []
            for alt in alts:
                # Replace parentheses before splitting
                alt = alt.replace(" (", ", ")
                alt = alt.replace(")", "")
                for new_alt in alt.split(", "):
                    new_alts.append(new_alt)
            alts = new_alts
        return col, alts, split_extra_tags
def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
    """Convert a list of cell alternatives into (form, roman, ipa) triples.

    Handles the special case where romanization is given under the
    normal form, e.g. in Russian.  There can be multiple
    comma-separated forms in each case.  We also handle the case
    where instead of romanization we have IPA pronunciation
    (e.g., avoir/French/verb).

    The layouts recognized, in order of priority:
      1. forms first, IPA under (base, base, /ipa/, /ipa/)
      2. several forms, a single trailing IPA (base, base, base, /ipa/)
      3. one form, several IPA alternatives (base, /ipa/, /ipa/, /ipa/)
      4. forms first, romanizations under (base, base, rom, rom)
      5. forms and romanizations alternating (base, rom, base, rom)
      6. fallback: expand parenthesized letter alternatives like
         "kind(er)" into separate plain forms with no roman/ipa.
    """
    len2 = len(alts) // 2
    # Check for IPAs (forms first, IPAs under)
    # base, base, IPA, IPA
    if (
        len(alts) % 2 == 0  # Divisible by two
        and all(
            re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
            for x in alts[len2:]
        )
    ):  # In the second half of alts
        nalts = list(
            (alts[i], "", alts[i + len2])
            # List of tuples: (base, "", ipa)
            for i in range(len2)
        )
    # base, base, base, IPA
    elif (
        len(alts) > 2
        and re.match(r"^\s*/.*/\s*$", alts[-1])
        and all(not x.startswith("/") for x in alts[:-1])
    ):
        # Only if the last alt is IPA; it is paired with every base form
        nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
    # base, IPA, IPA, IPA
    elif (
        len(alts) > 2
        and not alts[0].startswith("/")
        and all(
            re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
        )
    ):
        # First is base and the rest are IPA alternatives
        nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

    # Check for romanizations, forms first, romanizations under
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    # Remove ends of strings starting from ^.
                    # Superscripts have already been removed
                    # from the string, while ^xyz needs to be
                    # removed separately, though it's usually
                    # something with a single letter?
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            == "other"
            for x in alts[:len2]
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for x in alts[len2:]
        )
    ):
        nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
    # Check for romanizations, forms and romanizations alternating
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            == "other"
            # even indices must classify as plain ("other") text
            for i in range(0, len(alts), 2)
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            # odd indices must classify as romanization/English
            for i in range(1, len(alts), 2)
        )
    ):
        # Pair each even-index form with the following odd-index roman
        nalts = list(
            (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
        )
    else:
        # Fallback: expand parenthesized letter alternatives inside each
        # alternative, e.g. "kind(er)" -> "kind"/"kinder".
        new_alts = []
        for alt in alts:
            lst = [""]  # partial expansions built so far
            idx = 0  # position in alt up to which lst has consumed
            for m in re.finditer(
                r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                # start OR letter OR asterisk (word/word*)
                # \\___________group 1_______/ \  \_g3_///
                #               \              \__gr. 2_//
                #  \_____________group 0________________/
                alt,
            ):
                v = m.group(2)  # (word/word/word...)
                if (
                    classify_desc(v) == "tags"  # Tags inside parens
                    or m.group(0) == alt
                ):  # All in parens
                    continue
                new_lst = []
                for x in lst:
                    x += alt[idx : m.start()] + m.group(1)
                    # alt until letter or asterisk
                    idx = m.end()
                    vparts = v.split("/")
                    # group(2) = ["word", "wörd"...]
                    if len(vparts) == 1:
                        new_lst.append(x)
                        new_lst.append(x + v)
                        # "kind(er)" -> ["kind", "kinder"]
                    else:
                        for vv in vparts:
                            new_lst.append(x + vv)
                            # "lampai(tten/den)" ->
                            # ["lampaitten", "lampaiden"]
                lst = new_lst
            for x in lst:
                new_alts.append(x + alt[idx:])
                # add the end of alt
        nalts = list((x, "", "") for x in new_alts)
        # [form, no romanization, no ipa]
    return nalts
def find_semantic_parens(form: str) -> tuple[str, list[str]]:
    """Strip semantic brackets from a fully bracketed form.

    "Some languages" (=Greek) use brackets to mark things that
    require tags, like (informality), [rarity] and {archaicity}.
    The brackets are always removed; the corresponding tags are
    added only when the language configuration enables that
    interpretation (e.g. είμαι/Greek/Verb).  Returns the possibly
    stripped form and a list of extra tags to attach to it.
    """
    # Each rule: (regex matching a fully bracketed form, number of
    # characters to strip from each end, predicate deciding whether
    # the tags apply for this language, tags to add when it does).
    bracket_rules = [
        (
            r"\([^][(){}]*\)$",
            1,
            lambda: get_lang_conf(lang, "parentheses_for_informal"),
            ["informal"],
        ),
        (
            r"\{\[[^][(){}]*\]\}$",
            2,
            lambda: get_lang_conf(lang, "square_brackets_for_rare")
            and get_lang_conf(lang, "curly_brackets_for_archaic"),
            ["rare", "archaic"],
        ),
        (
            r"\{[^][(){}]*\}$",
            1,
            lambda: get_lang_conf(lang, "curly_brackets_for_archaic"),
            ["archaic"],
        ),
        (
            r"\[[^][(){}]*\]$",
            1,
            lambda: get_lang_conf(lang, "square_brackets_for_rare"),
            ["rare"],
        ),
    ]
    extra_tags: list[str] = []
    for pattern, width, tags_apply, tags in bracket_rules:
        if re.match(pattern, form):
            # Brackets are stripped unconditionally; tags only when
            # the language config says these brackets are semantic.
            if tags_apply():
                extra_tags.extend(tags)
            form = form[width:-width]
            break
    return form, extra_tags
def handle_parens(
    form: str, roman: str, clitic: str, extra_tags: list[str]
) -> tuple[str, str, str]:
    """Interpret a parenthesized part of a form cell.

    NOTE(review): this relies on closure variables from the enclosing
    scope — `paren` (the text inside the parentheses), `m` (the regex
    match locating the parentheses within `form`) and `subst` (the
    whitespace to substitute for the removed parenthetical); they are
    set just before this is called.

    The parenthetical is classified, in order, as: a clitic (starts
    with an apostrophe), a tag expression (added to `extra_tags`,
    mutated in place), a romanization (only if not at the start of the
    form and no romanization was given yet), or an ignorable
    "with ..."/"-form" note.  In each recognized case the parenthetical
    is removed from `form`.  Returns the updated (form, roman, clitic).
    """
    if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
        # is there a clitic starting with apostrophe?
        clitic = paren
        # assume the whole paren is a clitic
        # then remove paren from form
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif classify_desc(paren) == "tags":
        tagsets1, topics1 = decode_tags(paren)
        if not topics1:
            for ts in tagsets1:
                ts = tuple(x for x in ts if " " not in x)
                # There are some generated tags containing
                # spaces; do not let them through here.
                extra_tags.extend(ts)
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
    # brackets contain romanization
    elif (
        m.start() > 0
        and not roman
        and classify_desc(form[: m.start()]) == "other"
        and
        # "other" ~ text
        classify_desc(paren) in ("romanization", "english")
        and not re.search(r"^with |-form$", paren)
    ):
        roman = paren
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif re.search(r"^with |-form", paren):
        # purely descriptive note; drop it from the form
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    return form, roman, clitic
def merge_row_and_column_tags(form: str, some_has_covered_text: bool):
    """Combine row-header and column-header tags into form entries.

    Merge column tags and row tags.  We give preference to moods etc.
    coming from rowtags (cf. aussteigen/German/Verb imperative forms).

    NOTE(review): heavy closure use — reads `rowtags`, `coltags`,
    `global_tags`, `extra_tags`, `refs_tags`, `tablecontext`,
    `col_idx`, `has_covering_hdr`, `source`, `roman`, `ipa`, `clitic`
    and `object_concord_replacements` from the enclosing scope.

    Returns (ret, form, some_has_covered_text) where `ret` is a list
    of form dicts ({"form", "tags", "source", optionally "roman"/"ipa"}),
    `form` is the possibly adjusted form text, and
    `some_has_covered_text` reflects whether a covered-by-header cell
    with text has been seen.

    In certain cases, what a tag means depends on whether it is a row
    or column header.  Depending on the language, we replace certain
    tags with others if they're in a column or row.
    """
    ret = []
    # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
    # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
    for rt in sorted(rowtags):
        if "dummy-use-as-coltags" in rt:
            continue
        # if lang was in rowtag_replacements)
        # if not rtagreplacs == None:
        #     rt = replace_directional_tags(rt, rtagreplacs)
        for ct in sorted(coltags):
            if "dummy-use-as-rowtags" in ct:
                continue
            # if lang was in coltag_replacements
            # if not ctagreplacs == None:
            #     ct = replace_directional_tags(ct,
            #                                   ctagreplacs)
            tags = set(global_tags)
            tags.update(extra_tags)
            tags.update(rt)
            tags.update(refs_tags)
            tags.update(tablecontext.section_header)
            # Merge tags from column.  For certain kinds of tags,
            # those coming from the row take precedence.
            old_tags = set(tags)
            for t in ct:
                c = valid_tags[t]
                # Skip a column mood/case/number tag when the row
                # already supplied a tag of the same category.
                if c in ("mood", "case", "number") and any(
                    valid_tags[tt] == c for tt in old_tags
                ):
                    continue
                tags.add(t)

            # Extract language-specific tags from the
            # form.  This may also adjust the form.
            form, lang_tags = lang_specific_tags(lang, pos, form)
            tags.update(lang_tags)

            # For non-finite verb forms, see if they have
            # a gender/class suffix
            if pos == "verb" and any(
                valid_tags[t] == "non-finite" for t in tags
            ):
                form, tt = parse_head_final_tags(wxr, lang, form)
                tags.update(tt)

            # Remove "personal" tag if have nth person; these
            # come up with e.g. reconhecer/Portuguese/Verb.  But
            # not if we also have "pronoun"
            if (
                "personal" in tags
                and "pronoun" not in tags
                and any(
                    x in tags
                    for x in [
                        "first-person",
                        "second-person",
                        "third-person",
                    ]
                )
            ):
                tags.remove("personal")

            # If we have impersonal, remove person and number.
            # This happens with e.g. viajar/Portuguese/Verb
            if "impersonal" in tags:
                tags = tags - set(
                    [
                        "first-person",
                        "second-person",
                        "third-person",
                        "singular",
                        "plural",
                    ]
                )

            # Remove unnecessary "positive" tag from verb forms
            if pos == "verb" and "positive" in tags:
                if "negative" in tags:
                    tags.remove("negative")
                tags.remove("positive")

            # Many Russian (and other Slavic) inflection tables
            # have animate/inanimate distinction that generates
            # separate entries for neuter/feminine, but the
            # distinction only applies to masculine.  Remove them
            # from neuter/feminine and eliminate duplicates.
            if get_lang_conf(lang, "masc_only_animate"):
                for t1 in ("animate", "inanimate"):
                    for t2 in ("neuter", "feminine"):
                        if (
                            t1 in tags
                            and t2 in tags
                            and "masculine" not in tags
                            and "plural" not in tags
                        ):
                            tags.remove(t1)

            # German adjective tables contain "(keiner)" etc
            # for mixed declension plural.  When the adjective
            # disappears and it becomes just one word, remove
            # the "includes-article" tag.  e.g. eiskalt/German
            if "includes-article" in tags and " " not in form:
                tags.remove("includes-article")

            # Handle ignored forms.  We mark that the form was
            # provided.  This is important information; some words
            # just do not have a certain form.  However, there are also
            # many cases where no word in a language has a
            # particular form.  Post-processing could detect and
            # remove such cases.
            if form in IGNORED_COLVALUES:
                # if cell text seems to be ignorable
                if "dummy-ignore-skipped" in tags:
                    continue
                if (
                    col_idx not in has_covering_hdr
                    and some_has_covered_text
                ):
                    continue
                # don't ignore this cell if there's been a header
                # above it
                form = "-"
            elif col_idx in has_covering_hdr:
                some_has_covered_text = True

            # Handle ambiguous object concord.  If a header
            # gives the "dummy-object-concord"-tag to a word,
            # replace person, number and gender tags with
            # their "object-" counterparts so that the verb
            # agrees with the object instead.
            # Use only when the verb has ONLY object agreement!
            # a پخول/Pashto
            if "dummy-object-concord" in tags:
                for subtag, objtag in object_concord_replacements.items():
                    if subtag in tags:
                        tags.remove(subtag)
                        tags.add(objtag)

            # Remove the dummy mood tag that we sometimes
            # use to block adding other mood and related
            # tags
            tags = tags - set(
                [
                    "dummy-mood",
                    "dummy-tense",
                    "dummy-ignore-skipped",
                    "dummy-object-concord",
                    "dummy-reset-headers",
                    "dummy-use-as-coltags",
                    "dummy-use-as-rowtags",
                    "dummy-store-hdrspan",
                    "dummy-load-stored-hdrspans",
                    "dummy-reset-stored-hdrspans",
                    "dummy-section-header",
                ]
            )

            # Perform language-specific tag replacements according
            # to rules in a table.
            lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
            if lang_tag_mappings is not None:
                for pre, post in lang_tag_mappings.items():
                    if all(t in tags for t in pre):
                        tags = (tags - set(pre)) | set(post)

            # Warn if there are entries with empty tags
            if not tags:
                wxr.wtp.debug(
                    "inflection table: empty tags for {}".format(form),
                    sortid="inflection/1826",
                )

            # Warn if form looks like IPA
            ########## XXX ########
            # Because IPA is its own unicode block, we could also
            # technically do a Unicode name check to see if a string
            # contains IPA.  Not all valid IPA characters are in the
            # IPA extension block, so you can technically have false
            # negatives if it's something like /toki/, but it
            # shouldn't give false positives.
            # Alternatively, you could make a list of IPA-admissible
            # characters and reject non-IPA stuff with that.
            if re.match(r"\s*/.*/\s*$", form):
                wxr.wtp.debug(
                    "inflection table form looks like IPA: "
                    "form={} tags={}".format(form, tags),
                    sortid="inflection/1840",
                )

            # Note that this checks `form`, not `in tags`
            if form == "dummy-ignored-text-cell":
                continue

            if "dummy-remove-this-cell" in tags:
                continue

            # Add the form
            tags = list(sorted(tags))
            dt = {"form": form, "tags": tags, "source": source}
            if roman:
                dt["roman"] = roman
            if ipa:
                dt["ipa"] = ipa
            ret.append(dt)
            # If we got a separate clitic form, add it
            if clitic:
                dt = {
                    "form": clitic,
                    "tags": tags + ["clitic"],
                    "source": source,
                }
                ret.append(dt)
    return ret, form, some_has_covered_text
2168 # First extract definitions from cells
2169 # See defs_ht for footnote defs stuff
2170 for row in rows:
2171 for cell in row:
2172 text, refs, defs, hdr_tags = extract_cell_content(
2173 lang, word, cell.text
2174 )
2175 # refs, defs = footnote stuff, defs -> (ref, def)
2176 add_defs(defs)
2177 # Extract definitions from text after table
2178 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after)
2179 add_defs(defs)
2181 # Then extract the actual forms
2182 ret = []
2183 hdrspans = []
2184 first_col_has_text = False
2185 rownum = 0
2186 title = None
2187 global_tags = []
2188 table_tags = []
2189 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits")
2190 form_replacements = get_lang_conf(lang, "form_replacements")
2191 form_transformations = get_lang_conf(lang, "form_transformations")
2192 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells")
2193 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups")
2195 for title in titles:
2196 more_global_tags, more_table_tags, extra_forms = parse_title(
2197 title, source
2198 )
2199 global_tags.extend(more_global_tags)
2200 table_tags.extend(more_table_tags)
2201 ret.extend(extra_forms)
2202 cell_rowcnt = collections.defaultdict(int)
2203 seen_cells = set()
2204 has_covering_hdr = set()
2205 some_has_covered_text = False
2206 for row in rows:
2207 # print("ROW:", row)
2208 # print("====")
2209 # print(f"Start of PREVIOUS row hdrspans:"
2210 # f"{tuple(sp.tagsets for sp in hdrspans)}")
2211 # print(f"Start of row txt: {tuple(t.text for t in row)}")
2212 if not row: 2212 ↛ 2213line 2212 didn't jump to line 2213 because the condition on line 2212 was never true
2213 continue # Skip empty rows
2214 all_headers = all(x.is_title or not x.text.strip() for x in row)
2215 text = row[0].text
2216 if (
2217 row[0].is_title
2218 and text
2219 and not is_superscript(text[0])
2220 and text not in infl_map # zealous inflation map?
2221 and (
2222 re.match(r"Inflection ", text)
2223 or re.sub(
2224 r"\s+",
2225 " ", # flatten whitespace
2226 re.sub(
2227 r"\s*\([^)]*\)",
2228 "",
2229 # Remove whitespace+parens
2230 text,
2231 ),
2232 ).strip()
2233 not in infl_map
2234 )
2235 and not re.match(infl_start_re, text)
2236 and all(
2237 x.is_title == row[0].is_title and x.text == text
2238 # all InflCells in `row` have the same is_title and text
2239 for x in row
2240 )
2241 ):
2242 if text and title is None:
2243 # Only if there were no titles previously make the first
2244 # text that is found the title
2245 title = text
2246 if re.match(r"(Note:|Notes:)", title): 2246 ↛ 2247line 2246 didn't jump to line 2247 because the condition on line 2246 was never true
2247 continue # not a title
2248 more_global_tags, more_table_tags, extra_forms = parse_title(
2249 title, source
2250 )
2251 global_tags.extend(more_global_tags)
2252 table_tags.extend(more_table_tags)
2253 ret.extend(extra_forms)
2254 continue # Skip title rows without incrementing i
2255 if "dummy-skip-this" in global_tags: 2255 ↛ 2256line 2255 didn't jump to line 2256 because the condition on line 2255 was never true
2256 return []
2257 rowtags = [()]
2258 # have_hdr = False
2259 # have_hdr never used?
2260 have_text = False
2261 samecell_cnt = 0
2262 col0_hdrspan = None # col0 or later header (despite its name)
2263 col0_followed_by_nonempty = False
2264 row_empty = True
2265 for col_idx, cell in enumerate(row):
2266 colspan = cell.colspan # >= 1
2267 rowspan = cell.rowspan # >= 1
2268 previously_seen = id(cell) in seen_cells
2269 # checks to see if this cell was in the previous ROW
2270 seen_cells.add(id(cell))
2271 if samecell_cnt == 0:
2272 # First column of a (possible multi-column) cell
2273 samecell_cnt = colspan - 1
2274 else:
2275 assert samecell_cnt > 0
2276 samecell_cnt -= 1
2277 continue
2279 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0
2280 # never used?
2282 # defaultdict(int) around line 1900
2283 cell_rowcnt[id(cell)] += 1
2284 # => how many cols this spans
2285 col = cell.text
2286 if not col:
2287 continue
2288 row_empty = False
2289 is_title = cell.is_title
2291 # If the cell has a target, i.e., text after colon, interpret
2292 # it as simply specifying a value for that value and ignore
2293 # it otherwise.
2294 if cell.target:
2295 text, refs, defs, hdr_tags = extract_cell_content(
2296 lang, word, col
2297 )
2298 if not text: 2298 ↛ 2299line 2298 didn't jump to line 2299 because the condition on line 2298 was never true
2299 continue
2300 refs_tags = set()
2301 for ref in refs: # gets tags from footnotes 2301 ↛ 2302line 2301 didn't jump to line 2302 because the loop on line 2301 never started
2302 if ref in def_ht:
2303 refs_tags.update(def_ht[ref])
2304 rowtags = expand_header(
2305 wxr,
2306 tablecontext,
2307 word,
2308 lang,
2309 pos,
2310 text,
2311 [],
2312 silent=True,
2313 depth=depth,
2314 )
2315 rowtags = list(
2316 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags)
2317 )
2318 is_title = False
2319 col = cell.target
2321 # print(rownum, col_idx, col)
2322 # print(f"is_title: {is_title}")
2323 if is_title:
2324 # It is a header cell
2325 text, refs, defs, hdr_tags = extract_cell_content(
2326 lang, word, col
2327 )
2328 if not text:
2329 continue
2330 # Extract tags from referenced footnotes
2331 refs_tags = set()
2332 for ref in refs:
2333 if ref in def_ht:
2334 refs_tags.update(def_ht[ref])
2336 # Expand header to tags
2337 v = expand_header(
2338 wxr,
2339 tablecontext,
2340 word,
2341 lang,
2342 pos,
2343 text,
2344 [],
2345 silent=True,
2346 depth=depth,
2347 )
2348 # print("EXPANDED {!r} to {}".format(text, v))
2350 if col_idx == 0:
2351 # first_col_has_text is used for a test to ignore
2352 # upper-left cells that are just text without
2353 # header info
2354 first_col_has_text = True
2355 # Check if the header expands to reset hdrspans
2356 if any("dummy-reset-headers" in tt for tt in v):
2357 new_hdrspans = []
2358 for hdrspan in hdrspans:
2359 # if there are HdrSpan objects (abstract headers with
2360 # row- and column-spans) that are to the left or at the
2361 # same row or below, KEEP those; things above and to
2362 # the right of the hdrspan with dummy-reset-headers
2363 # are discarded. Tags from the header together with
2364 # dummy-reset-headers are kept as normal.
2365 if (
2366 hdrspan.start + hdrspan.colspan < col_idx
2367 or hdrspan.rownum > rownum - cell.rowspan
2368 ):
2369 new_hdrspans.append(hdrspan)
2370 hdrspans = new_hdrspans
2372 for tt in v:
2373 if "dummy-section-header" in tt: 2373 ↛ 2374line 2373 didn't jump to line 2374 because the condition on line 2373 was never true
2374 tablecontext.section_header = tt
2375 break
2376 if "dummy-reset-section-header" in tt: 2376 ↛ 2377line 2376 didn't jump to line 2377 because the condition on line 2376 was never true
2377 tablecontext.section_header = []
2378 # Text between headers on a row causes earlier headers to
2379 # be reset
2380 if have_text:
2381 # print(" HAVE_TEXT BEFORE HDR:", col)
2382 # Reset rowtags if new title column after previous
2383 # text cells
2384 # +-----+-----+-----+-----+
2385 # |hdr-a|txt-a|hdr-B|txt-B|
2386 # +-----+-----+-----+-----+
2387 # ^reset rowtags=>
2388 # XXX beware of header "—": "" - must not clear on that if
2389 # it expands to no tags
2390 rowtags = [()]
2391 # have_hdr = True
2392 # have_hdr never used?
2393 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags))
2394 # Update rowtags and coltags
2395 has_covering_hdr.add(col_idx) # col_idx == current column
2396 # has_covering_hdr is a set that has the col_idx-ids of columns
2397 # that have previously had some kind of header. It is never
2398 # resetted inside the col_idx-loops OR the bigger rows-loop, so
2399 # applies to the whole table.
2401 rowtags, new_coltags, all_hdr_tags = generate_tags(
2402 rowtags, table_tags
2403 )
2405 if any("dummy-skip-this" in ts for ts in rowtags):
2406 continue # Skip this cell
2408 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2408 ↛ 2409line 2408 didn't jump to line 2409 because the condition on line 2408 was never true
2409 hdrspans.extend(tablecontext.stored_hdrspans)
2411 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2411 ↛ 2412line 2411 didn't jump to line 2412 because the condition on line 2411 was never true
2412 tablecontext.stored_hdrspans = []
2414 if any("dummy-store-hdrspan" in ts for ts in v): 2414 ↛ 2416line 2414 didn't jump to line 2416 because the condition on line 2414 was never true
2415 # print(f"STORED: {col}")
2416 store_new_hdrspan = True
2417 else:
2418 store_new_hdrspan = False
2420 new_coltags = list(
2421 x
2422 for x in new_coltags
2423 if not any(t in noinherit_tags for t in x)
2424 )
2425 # print("new_coltags={} previously_seen={} all_hdr_tags={}"
2426 # .format(new_coltags, previously_seen, all_hdr_tags))
2427 if any(new_coltags):
2428 (
2429 col,
2430 col0_followed_by_nonempty,
2431 col0_hdrspan,
2432 ) = add_new_hdrspan(
2433 col,
2434 hdrspans,
2435 store_new_hdrspan,
2436 col0_followed_by_nonempty,
2437 col0_hdrspan,
2438 )
2440 continue
2442 # These values are ignored, at least for now
2443 if re.match(r"^(# |\(see )", col): 2443 ↛ 2444line 2443 didn't jump to line 2444 because the condition on line 2443 was never true
2444 continue
2446 if any("dummy-skip-this" in ts for ts in rowtags):
2447 continue # Skip this cell
2449 # If the word has no rowtags and is a multi-row cell, then
2450 # ignore this. This happens with empty separator rows
2451 # within a rowspan>1 cell. cf. wander/English/Conjugation.
2452 if rowtags == [()] and rowspan > 1:
2453 continue
2455 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle.
2456 if cleanup_rules:
2457 for regx, substitution in cleanup_rules.items():
2458 col = re.sub(regx, substitution, col)
2460 if ( 2460 ↛ 2465line 2460 didn't jump to line 2465 because the condition on line 2460 was never true
2461 col_idx == 0
2462 and not first_col_has_text
2463 and get_lang_conf(lang, "ignore_top_left_text_cell") is True
2464 ):
2465 continue # Skip text at top left, as in Icelandic, Faroese
2467 # if col0_hdrspan is not None:
2468 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}"
2469 # .format(col0_hdrspan.text, col))
2470 col0_followed_by_nonempty = True
2471 have_text = True
2473 # Determine column tags for the multi-column cell
2474 combined_coltags = compute_coltags(
2475 lang, pos, hdrspans, col_idx, colspan, col
2476 )
2477 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2477 ↛ 2478line 2477 didn't jump to line 2478 because the condition on line 2477 was never true
2478 continue
2480 # print("HAVE_TEXT:", repr(col))
2481 # Split the text into separate forms. First simplify spaces except
2482 # newline.
2483 col = re.sub(r"[ \t\r]+", " ", col)
2484 # Split the cell text into alternatives
2486 col, alts, split_extra_tags = split_text_into_alts(col)
2488 # Some cells have mixed form content, like text and romanization,
2489 # or text and IPA. Handle these.
2490 alts = handle_mixed_lines(alts)
2492 alts = list((x, combined_coltags) for x in alts)
2494 # Generate forms from the alternatives
2495 # alts is a list of (tuple of forms, tuple of tags)
2496 for (form, base_roman, ipa), coltags in alts:
2497 form = form.strip()
2498 extra_tags = []
2499 extra_tags.extend(split_extra_tags)
2500 # Handle special splits again here, so that we can have custom
2501 # mappings from form to form and tags.
2502 if form in form_replacements:
2503 replacement, tags = form_replacements[form]
2504 for x in tags.split():
2505 assert x in valid_tags
2506 assert isinstance(replacement, str)
2507 assert isinstance(tags, str)
2508 form = replacement
2509 extra_tags.extend(tags.split())
2511 check_romanization_form_transformation = False
2512 # loop over regexes in form_transformation and replace text
2513 # in form using regex patterns
2514 # this does a bit of the same stuff the above does,
2515 # but with regexes and re.sub() instead
2516 for (
2517 form_transformations_pos,
2518 v,
2519 subst,
2520 tags,
2521 ) in form_transformations:
2522 # v is a pattern string, like "^ich"
2523 if pos != form_transformations_pos:
2524 continue
2525 m = re.search(v, form)
2526 if m is not None:
2527 form = re.sub(v, subst, form)
2528 for x in tags.split():
2529 assert x in valid_tags
2530 extra_tags.extend(tags.split())
2531 check_romanization_form_transformation = True
2532 break
2534 # Clean the value, extracting reference symbols
2535 form, refs, defs, hdr_tags = extract_cell_content(
2536 lang, word, form
2537 )
2538 # if refs:
2539 # print("REFS:", refs)
2540 extra_tags.extend(hdr_tags)
2541 # Extract tags from referenced footnotes
2542 # Extract tags from referenced footnotes
2543 refs_tags = set()
2544 for ref in refs:
2545 if ref in def_ht:
2546 refs_tags.update(def_ht[ref])
2548 if base_roman:
2549 if check_romanization_form_transformation: 2549 ↛ 2553line 2549 didn't jump to line 2553 because the condition on line 2549 was never true
2550 # because form_transformations are used to handle things
2551 # where the romanization has the "same" structure, we
2552 # need to handle that here too....
2553 for (
2554 _,
2555 v,
2556 subst,
2557 _,
2558 ) in form_transformations:
2559 # v is a pattern string, like "^ich"
2560 m = re.search(v, base_roman)
2561 if m is not None:
2562 base_roman = re.sub(v, subst, base_roman)
2563 # XXX add tag stuff here if needed
2564 break
2566 base_roman, _, _, hdr_tags = extract_cell_content(
2567 lang, word, base_roman
2568 )
2569 extra_tags.extend(hdr_tags)
2571 # Do some additional cleanup on the cell.
2572 form = re.sub(r"^\s*,\s*", "", form)
2573 form = re.sub(r"\s*,\s*$", "", form)
2574 form = re.sub(r"\s*(,\s*)+", ", ", form)
2575 form = re.sub(r"(?i)^Main:", "", form)
2576 form = re.sub(r"\s+", " ", form)
2577 form = form.strip()
2579 # Look for parentheses that have semantic meaning
2580 form, et = find_semantic_parens(form)
2581 extra_tags.extend(et)
2583 # Handle parentheses in the table element. We parse
2584 # tags anywhere and romanizations anywhere but beginning.
2585 roman = base_roman
2586 paren = None
2587 clitic = None
2588 m = re.search(r"(\s+|^)\(([^)]*)\)", form)
2589 # start|spaces + (anything)
2590 if m is not None:
2591 subst = m.group(1)
2592 paren = m.group(2)
2593 else:
2594 m = re.search(r"\(([^)]*)\)(\s+|$)", form)
2595 # (anything) + spaces|end
2596 if m is not None: 2596 ↛ 2597line 2596 didn't jump to line 2597 because the condition on line 2596 was never true
2597 paren = m.group(1)
2598 subst = m.group(2)
2599 if paren is not None:
2600 form, roman, clitic = handle_parens(
2601 form, roman, clitic, extra_tags
2602 )
2604 # Ignore certain forms that are not really forms,
2605 # unless they're really, really close to the article title
2606 if form in ( 2606 ↛ 2611line 2606 didn't jump to line 2611 because the condition on line 2606 was never true
2607 "",
2608 "unchanged",
2609 "after an", # in sona/Irish/Adj/Mutation
2610 ):
2611 Lev = distw([form], word)
2612 if form and Lev < 0.1:
2613 wxr.wtp.debug(
2614 "accepted possible false positive '{}' with"
2615 "> 0.1 Levenshtein distance in {}/{}".format(
2616 form, word, lang
2617 ),
2618 sortid="inflection/2213",
2619 )
2620 elif form and Lev < 0.3:
2621 wxr.wtp.debug(
2622 "skipped possible match '{}' with > 0.3"
2623 "Levenshtein distance in {}/{}".format(
2624 form, word, lang
2625 ),
2626 sortid="inflection/2218",
2627 )
2628 continue
2629 else:
2630 continue
2631 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} "
2632 # "FORM={!r} ROMAN={!r}"
2633 # .format(rowtags, coltags, refs_tags,
2634 # form, roman))
2636 # Merge tags from row and column and do miscellaneous
2637 # tag-related handling.
2638 (
2639 merge_ret,
2640 form,
2641 some_has_covered_text,
2642 ) = merge_row_and_column_tags(form, some_has_covered_text)
2643 ret.extend(merge_ret)
2645 # End of row.
2646 rownum += 1
2647 # For certain languages, if the row was empty, reset
2648 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb).
2649 if row_empty and get_lang_conf(lang, "empty_row_resets"):
2650 hdrspans = []
2651 # Check if we should expand col0_hdrspan.
2652 if col0_hdrspan is not None:
2653 col0_allowed = get_lang_conf(lang, "hdr_expand_first")
2654 col0_cats = tagset_cats(col0_hdrspan.tagsets)
2655 # Only expand if col0_cats and later_cats are allowed
2656 # and don't overlap and col0 has tags, and there have
2657 # been no disallowed cells in between.
2658 if (
2659 not col0_followed_by_nonempty
2660 and not (col0_cats - col0_allowed)
2661 and
2662 # len(col0_cats) == 1 and
2663 col_idx > col0_hdrspan.start + col0_hdrspan.colspan
2664 ):
2665 # If an earlier header is only followed by headers that yield
2666 # no tags, expand it to entire row
2667 # print("EXPANDING COL0: {} from {} to {} cols {}"
2668 # .format(col0_hdrspan.text, col0_hdrspan.colspan,
2669 # len(row) - col0_hdrspan.start,
2670 # col0_hdrspan.tagsets))
2671 col0_hdrspan.colspan = len(row) - col0_hdrspan.start
2672 col0_hdrspan.expanded = True
2673 # XXX handle refs and defs
2674 # for x in hdrspans:
2675 # print(" HDRSPAN {} {} {} {!r}"
2676 # .format(x.start, x.colspan, x.tagsets, x.text))
2678 # Post-process German nouns with articles in separate columns. We move the
2679 # definite/indefinite/usually-without-article markers into the noun and
2680 # remove the article entries.
2681 if get_lang_conf(lang, "articles_in_separate_columns") and any(
2682 "noun" in x["tags"] for x in ret
2683 ):
2684 new_ret = []
2685 saved_tags = set()
2686 had_noun = False
2687 for dt in ret:
2688 tags = dt["tags"]
2689 # print(tags)
2690 if "noun" in tags:
2691 tags = list(
2692 sorted(set(t for t in tags if t != "noun") | saved_tags)
2693 )
2694 had_noun = True
2695 elif ( 2695 ↛ 2722line 2695 didn't jump to line 2722 because the condition on line 2695 was always true
2696 "indefinite" in tags
2697 or "definite" in tags
2698 or "usually-without-article" in tags
2699 or "without-article" in tags
2700 ):
2701 if had_noun:
2702 saved_tags = set(tags)
2703 else:
2704 saved_tags = saved_tags | set(tags) # E.g. Haus/German
2705 remove_useless_tags(lang, pos, saved_tags)
2706 saved_tags = saved_tags & set(
2707 [
2708 "masculine",
2709 "feminine",
2710 "neuter",
2711 "singular",
2712 "plural",
2713 "indefinite",
2714 "definite",
2715 "usually-without-article",
2716 "without-article",
2717 ]
2718 )
2719 had_noun = False
2720 continue # Skip the articles
2722 dt = dt.copy()
2723 dt["tags"] = tags
2724 new_ret.append(dt)
2725 ret = new_ret
2727 elif possibly_ignored_forms:
2728 # Some languages have tables with cells that are kind of separated
2729 # and difficult to handle, like eulersche Formel/German where
2730 # the definite and indefinite articles are just floating.
2731 # If a language has a dict of conditionally_ignored_cells,
2732 # and if the contents of a cell is found in one of the rules
2733 # there, ignore that cell if it
2734 # 1. Does not have the appropriate tag (like "definite" for "die")
2735 # and
2736 # 2. The title of the article is not one of the other co-words
2737 # (ie. it's an article for the definite articles in german etc.)
2738 # pass
2739 new_ret = []
2740 for cell_data in ret:
2741 tags = cell_data["tags"]
2742 text = cell_data["form"]
2743 skip_this = False
2744 for key_tag, ignored_forms in possibly_ignored_forms.items():
2745 if text not in ignored_forms: 2745 ↛ 2747line 2745 didn't jump to line 2747 because the condition on line 2745 was always true
2746 continue
2747 if word in ignored_forms:
2748 continue
2749 if key_tag not in tags:
2750 skip_this = True
2752 if skip_this: 2752 ↛ 2753line 2752 didn't jump to line 2753 because the condition on line 2752 was never true
2753 continue
2754 new_ret.append(cell_data)
2756 ret = new_ret
2758 # Post-process English inflection tables, addding "multiword-construction"
2759 # when the number of words has increased.
2760 if lang == "English" and pos == "verb":
2761 word_words = len(word.split())
2762 new_ret = []
2763 for dt in ret:
2764 form = dt.get("form", "")
2765 if len(form.split()) > word_words:
2766 dt = dt.copy()
2767 dt["tags"] = list(dt.get("tags", []))
2768 # This strange copy-assigning shuffle is preventative black
2769 # magic; do not touch lest you invoke deep bugs.
2770 data_append(dt, "tags", "multiword-construction")
2771 new_ret.append(dt)
2772 ret = new_ret
2774 # Always insert "table-tags" detail as the first entry in any inflection
2775 # table. This way we can reliably detect where a new table starts.
2776 # Table-tags applies until the next table-tags entry.
2777 if ret or table_tags:
2778 table_tags = list(sorted(set(table_tags)))
2779 dt = {
2780 "form": " ".join(table_tags),
2781 "source": source,
2782 "tags": ["table-tags"],
2783 }
2784 if dt["form"] == "":
2785 dt["form"] = "no-table-tags"
2786 if tablecontext.template_name:
2787 tn = {
2788 "form": tablecontext.template_name,
2789 "source": source,
2790 "tags": ["inflection-template"],
2791 }
2792 ret = [dt] + [tn] + ret
2793 else:
2794 ret = [dt] + ret
2796 return ret
def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Parse ``rows`` (a grid of InflCell objects) into word forms and add
    them to ``data``, skipping exact duplicates and redundant "dated"
    variants of forms that were already added without that tag."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        assert isinstance(row, list)
        for cell in row:
            assert isinstance(cell, InflCell)
    assert isinstance(titles, list)
    for title in titles:
        assert isinstance(title, str)

    # Attempt the simple-table parser first; there is currently no
    # fallback for other table layouts.
    parsed = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if parsed is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Insert the parsed forms while eliminating duplicates.  Some Russian
    # words have Declension and Pre-reform declension tables that partially
    # duplicate the same data; a form carrying "dated" is dropped when the
    # identical form without "dated" was already added from the modern
    # declension table.
    seen = set()
    for form_dt in parsed:
        frozen = freeze(form_dt)
        if frozen in seen:
            continue  # exact duplicate of an already-added form
        tags = form_dt.get("tags", [])
        redundant_dated = False
        for dated_tag in ("dated",):
            if dated_tag not in tags:
                continue
            undated = form_dt.copy()
            remaining = list(t for t in tags if t != dated_tag)
            undated["tags"] = remaining
            if remaining and freeze(undated) in seen:
                redundant_dated = True  # already have it without "dated"
                break
        if redundant_dated:
            continue
        # "table-tags" marker entries are never used for dedup bookkeeping.
        if "table-tags" not in tags:
            seen.add(frozen)
        data_append(data, "forms", form_dt)
def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Decide heuristically whether a table cell should be treated as a
    header.

    Returns a 4-tuple ``(is_title, hdr_expansion, target, celltext)``:
    ``is_title`` is True when the cell is a header; ``hdr_expansion`` is the
    tag-set expansion produced by expand_header() for the cleaned cell text;
    ``target`` and ``celltext`` may be rewritten when the cell has the form
    "header: target".
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    # NOTE(review): if table_kind is neither TABLE nor HTML, header_kind is
    # left unbound and the comparisons below would raise NameError; callers
    # apparently only pass those two kinds — confirm.
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            # NOTE(review): the .get default is "" (a string), so when lang
            # is missing this is a substring test against the empty string,
            # which is True for any non-empty `cleaned` — presumably
            # intentional as a "reject" fallback; confirm.
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    if idx >= 0 and titletext[:idx] in infl_map:
        # "header: target" form — split the target off the header text.
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext
class TableContext:
    """Saved context used when parsing a table and its subtables.

    Attributes:
        stored_hdrspans: header spans carried over between a table and
            its subtables.
        section_header: header data for the current section.
        template_name: name of the template that produced the table, or
            "" when unknown.
    """

    # Bug fix: the original spelled this ``__slot__``, which is just a
    # meaningless plain class attribute; ``__slots__`` is the special name
    # that actually restricts instance attributes and drops the
    # per-instance __dict__.
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name=None):
        self.stored_hdrspans = []
        self.section_header = []
        # Normalize falsy template names (None, "") to the empty string.
        if not template_name:
            self.template_name = ""
        else:
            self.template_name = template_name
def handle_wikitext_or_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms. This adds the forms into ``data``.

    ``tree`` must be either a wikitext TABLE node or an HTML <table> node.
    ``titles`` is a list of title strings collected from the surrounding
    context and ``after`` is text that followed the table.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("-------==========-------")

    if not tablecontext:
        tablecontext = TableContext()

    def handle_table1(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        depth,
    ):
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order."""

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        col_gap_data = []  # Filling for columns with rowspan > 1
        # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
        # the column; vertical_still_left contains int
        cols_headered = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows = []

        sub_ret = []  # (rows, titles, after, depth) tuples from subtables

        # from wikitextprocessor.parser import print_tree
        # print_tree(tree)
        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are intially shown in tables
                    # that have more data. The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # if (
                #     len(node.children) == 1
                #     and node.children[0].attrs.get("class") == "separator"
                # ):
                #     print("------------------ skip separator")
                #     continue

                # Parse a table row.
                row = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in get_table_cells(node):
                    # loop through each cell in the ROW

                    # The below skip is not needed anymore, because we "skip"
                    # in get_table_cells, but left here as a comment
                    # if not isinstance(col, WikiNode):
                    #     # This skip is not used for counting,
                    #     # "None" is not used in
                    #     # indexing or counting or looping.
                    #     continue
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print("    UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column. It is populated at the end of the loop, at
                        # the same time as col_gap_data. This needs to be
                        # looped and filled this way because each
                        # `for col`-looping jumps straight to the next
                        # meaningful cell; there is no "None" cells, only
                        # emptiness between, and rowspan and colspan are just
                        # to generate the "fill-
                        vertical_still_left[len(row)] -= 1
                        row.append(col_gap_data[len(row)])

                        # appending row is how "indexing" is
                        # done here; something is appended,
                        # like a filler-cell here or a "start"
                        # cell at the end of the row-loop,
                        # which increased len(row) which is
                        # then used as the target-index to check
                        # for gaps. vertical_still_left is
                        # the countdown to when to stop
                        # filling in gaps, and goes down to 0,
                        # and col_gap_data is not touched
                        # except when a new rowspan is needed,
                        # at the same time that
                        # vertical_still_left gets reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        # Non-numeric span attributes fall back to 1x1.
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Clamp absurd span values to 100 to avoid generating
                    # huge filler rows/columns; too many of these to report
                    # as errors.
                    if colspan > 100:
                        # wxr.wtp.error(
                        #     f"Colspan {colspan} over 100, clamped",
                        #     sortid="inflection/20250113a",
                        # )
                        colspan = 100
                    if rowspan > 100:
                        # wxr.wtp.error(
                        #     f"Rowspan {rowspan} over 100, clamped",
                        #     sortid="inflection/20250113b",
                        # )
                        rowspan = 100

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: isinstance(x, WikiNode)
                        and (x.kind == NodeKind.TABLE or x.sarg == "table"),
                    )

                    # Clean the rest of the cell.
                    celltext = clean_node(wxr, None, rest)
                    # print("CLEANED:", celltext)
                    # print(f"SUBTABLES: {tables}")

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a
                        # descriptive text that should be treated as a title
                        # (e.g., "Forms with the definite article", with
                        # "definite" not mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Flush the rows collected so far before the
                            # subtable's rows so ordering is preserved.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    # Strip trailing superscript characters (reference marks)
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #     row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/ed if
                                # it is lacking slots
                                # for each column; col_gap_data and
                                # vertical_still_left are never
                                # reset to [], during
                                # the whole table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that
                            # will be used in the
                            # future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to be
                            # filled (row.append or
                            # row[col_gap_data[len(row)] =>
                            # rows), it is not reset to [], but decremented to 0
                            # each time a row gets something from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects. Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )
def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """Yield the cell nodes of a table row.

    If a wikitext table cell contains HTML cells `<td>`/`<th>` — as they
    sometimes do, because it is easier to write wikitext conditionals that
    way — those elements are parsed as children of the wikitext cell.
    This yields each direct child of ``node`` and, when such a child has
    direct th/td children, strips them out of the child (so they are not
    seen twice) and yields them right after it."""

    def _is_html_cell(child) -> bool:
        # True for a direct <th>/<td> HTML element.
        return isinstance(child, HTMLNode) and child.sarg in ("th", "td")

    for cell in node.children:
        if not isinstance(cell, WikiNode):
            continue
        embedded = [c for c in cell.children if _is_html_cell(c)]
        if embedded:
            # Detach the embedded th/td elements so they are not
            # returned twice, then yield the host cell followed by them.
            cell.children = [c for c in cell.children if not _is_html_cell(c)]
            yield cell
            yield from embedded
        else:
            yield cell
def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Thin dispatcher for HTML <table> nodes; simply delegates to
    handle_wikitext_or_html_table(). XXX remove these wrappers?"""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext=tablecontext,
    )
def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """A passer-on function for wikitext tables (the original docstring
    said "html-tables", copy-pasted from handle_html_table); delegates to
    handle_wikitext_or_html_table(). XXX remove these wrappers?"""
    handle_wikitext_or_html_table(
        wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
    )
def parse_inflection_section(
    wxr, data, word, lang, pos, section, tree, tablecontext=None
):
    """Parses an inflection section on a page. ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    Walks ``tree`` collecting tables (wikitext and HTML) along with any
    title text that precedes them (including NavFrame headers), then
    processes each collected table.
    """

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    #       .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    # NOTE(review): ``pos`` is not asserted here unlike the other string
    # parameters — presumably an oversight; it is passed straight through.
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    tables = []  # [kind, node, titles, after-fragments] per table found
    titleparts = []  # text fragments accumulated inside a NavFrame header
    preceding_bolded_title = ""

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("--------------******************----------------")

    def process_tables():
        # Dispatch each collected table to the appropriate handler.
        for kind, node, titles, after in tables:
            after = "".join(after).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node, titles):
        # Process a NavFrame div in isolation: tables found inside it are
        # collected into a fresh list and processed immediately, so the
        # NavHead title applies only to them.
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(node, titles, navframe=False):
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Loose text after a table is gathered as its "after" text;
            # inside a NavFrame it contributes to the pending title.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    # The NavHead text collected so far becomes a title for
                    # the tables inside this NavContent, unless it is just
                    # a "Note(s):" remark.
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                classes = node.attrs.get("class", ())
                # NOTE(review): default is a tuple but the attribute is a
                # string when present, so this is a substring test — works
                # for both, but worth confirming it is intentional.
                if "audiotable" in classes:
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
        if (
            kind == NodeKind.HTML
            and node.sarg == "div"
            and "NavFrame" in node.attrs.get("class", "").split()
        ):
            recurse_navframe(node, titles)
            return
        if kind == NodeKind.LINK:
            # For links, recurse into the display text if present,
            # otherwise into the target.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A ";"-list is a bolded definition-style title; remember it
            # so following top-level content can use it as a table title.
            nonlocal preceding_bolded_title
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")