Coverage for src/wiktextract/extractor/en/inflection.py: 86%
1475 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1# Code for parsing inflection tables.
2#
3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org.
5import collections
6import copy
7import functools
8import html
9import itertools
10import re
11import unicodedata
12from typing import Optional, Union
14from wikitextprocessor import MAGIC_FIRST, NodeKind, WikiNode
16from ...clean import clean_value
17from ...datautils import data_append, freeze, split_at_comma_semi
18from ...tags import valid_tags
19from ...wxr_context import WiktextractContext
20from .form_descriptions import (
21 classify_desc,
22 decode_tags,
23 distw,
24 parse_head_final_tags,
25)
26from .inflectiondata import infl_map, infl_start_map, infl_start_re
27from .lang_specific_configs import get_lang_conf, lang_specific_tags
28from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS
29from .type_utils import FormData
# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
# Module-level state; set via set_debug_cell_text() below.
debug_cell_text: Optional[str] = None
def set_debug_cell_text(text: str) -> None:
    """Set the module-level cell text that triggers debug output while
    parsing inflection tables (see --debug-text-cell)."""
    global debug_cell_text
    debug_cell_text = text
42TagSets = list[tuple[str, ...]]
44# Column texts that are interpreted as an empty column.
45IGNORED_COLVALUES = {
46 "-",
47 "־",
48 "᠆",
49 "‐",
50 "‑",
51 "‒",
52 "–",
53 "—",
54 "―",
55 "−",
56 "⸺",
57 "⸻",
58 "﹘",
59 "﹣",
60 "-",
61 "/",
62 "?",
63 "not used",
64 "not applicable",
65}
# These tags are never inherited from above
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags.  Every entry
# maps a tag to the same tag prefixed with "object-".
object_concord_replacements = {
    tag: "object-" + tag
    for tag in (
        "first-person",
        "second-person",
        "third-person",
        "singular",
        "plural",
        "definite",
        "indefinite",
        # Bantu-style noun classes 1-18
        *("class-{}".format(i) for i in range(1, 19)),
        "masculine",
        "feminine",
    )
}
# Words in title that cause addition of tags in all entries.
# Maps a lowercase phrase appearing in a table title to a space-separated
# list of tags added to every form extracted from that table.
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
}
# Import-time sanity check: every mapped value must consist of tags known
# to valid_tags; otherwise print a diagnostic (data error in this file).
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Header prefix that identifies the table-type line ("Inflection of X" etc.);
# matches overlapping this pattern are skipped when scanning titles below.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)
# Words in title that cause addition of tags to table-tags "form".
# Unlike title_contains_global_map, these tags describe the word itself
# and are not copied onto every individual inflected form.
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
}
# Import-time sanity check against valid_tags (data error in this file).
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)
# Parenthesized elements in title that are converted to tags in
# "table-tags" form.  Each element is matched after splitting the
# parenthesized part at commas and stripping whitespace.
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "Attic": "Attic",  # e.g. καλός/Greek/Adj
    "Epic": "Epic",  # e.g. καλός/Greek/Adj
}
# Import-time sanity check against valid_tags (data error in this file).
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Parenthesized element starts; map them to tags for a form built from
# the rest of the element (e.g. "type X" -> form "X" tagged "class").
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check against valid_tags (data error in this file).
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)
# Regexp for cell starts that are likely definitions of reference symbols
# (asterisks, daggers, digits, superscripts and modifier letters followed
# by a delimiter).  See also nondef_re for exceptions.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"  # taka/Swahili "15 / 17"
)

# Certain tags are moved from headers in tables into word tags, as they always
# apply to the whole word.  Currently empty: the original motivating case
# (Arabic number paradigms) is handled elsewhere now.
TAGS_FORCED_WORDTAGS: set[str] = set()
class InflCell:
    """Cell in an inflection table."""

    __slots__ = ("text", "is_title", "colspan", "rowspan", "target")

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        self.text = text.strip()
        # An empty cell is never treated as a title (falsy when text is "").
        self.is_title = text and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        desc = f"{self.text}/{self.is_title}/{self.colspan}/{self.rowspan!r}"
        if self.target:
            desc = f"{desc}: {self.target!r}"
        return desc

    def __repr__(self) -> str:
        return str(self)
class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: list[tuple[str, ...]],
        text: str,
        all_headers_row: bool,
    ) -> None:
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        for ts in tagsets:
            assert isinstance(ts, tuple)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Normalize each alternative: deduplicate its tags and sort them.
        self.tagsets = [tuple(sorted(set(ts))) for ts in tagsets]
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False
def is_superscript(ch: str) -> bool:
    """Returns True if the single character ``ch`` is a superscript
    character (Unicode SUPERSCRIPT or MODIFIER LETTER SMALL/CAPITAL)."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        name = unicodedata.name(ch)
    except ValueError:
        # Unnamed characters (e.g. unassigned code points) are not
        # superscripts.
        return False
    return name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )
def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no purpose
    together (cover all options).  Mutates ``tags`` in place."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Complementary pairs cancel out when both are present, for languages
    # whose configuration enables the removal.
    if (
        "animate" in tags
        and "inanimate" in tags
        and get_lang_conf(lang, "animate_inanimate_remove")
    ):
        tags.difference_update(("animate", "inanimate"))
    if (
        "virile" in tags
        and "nonvirile" in tags
        and get_lang_conf(lang, "virile_nonvirile_remove")
    ):
        tags.difference_update(("virile", "nonvirile"))
    # When every value of a grammatical category that the language uses is
    # present, the tags carry no information: drop the whole category.
    for category in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        values = get_lang_conf(lang, category)
        if values and all(v in tags for v in values):
            tags.difference_update(values)
def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    return {valid_tags[tag] for alternative in tagset for tag in alternative}
def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed to be sets of sorted tuples.
    Returns the merged tagset (always non-empty; contains () if both
    inputs were effectively empty)."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # BUG FIX: this assert previously re-validated tagsets1 (copy-paste);
    # it must validate tagsets2.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Add one alternative to ``tagsets``, merging it with an existing
        # alternative when they differ in at most one tag category.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        tagsets.append(())
    return tagsets
def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking the union of all combinations, without
    trying to determine whether they are compatible.  Returns a list of
    deduplicated, sorted tag tuples (one per combination)."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # BUG FIX: this assert previously re-validated tagsets1 (copy-paste);
    # it must validate tagsets2.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            # This internal marker must never survive into output tagsets.
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    return new_tagsets
@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags), where ``cleaned`` is the header text with
    reference markers removed, ``refs`` are extracted reference symbols,
    ``defs`` are (symbol, definition-text) pairs if the cell defines
    reference symbols, and ``tags`` are extra tags implied by markers."""
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags = []
    # Strip trailing comma/bullet and collapse internal whitespace.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Free-text notes are not forms; replace the whole cell with a marker.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers (caret-marked suffixes, innermost
    # first, e.g. "form^1" or "form^(a,b)").
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # Split the cell into (symbol, definition) pairs, one per def_re
        # match; the text between matches belongs to the preceding symbol.
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition ("1) text", "2: text", ...)
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    #       .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags
@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is dictionary describing additional forms to be
    included in the part-of-speech entry)."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Strip HTML entities/markup and normalize whitespace before matching.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags = []
    table_tags = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip matches that are just part of the "Inflection of X" prefix.
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    # "type X" -> extra form "X" tagged with e.g. "class"
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms
768def expand_header(
769 wxr: WiktextractContext,
770 tablecontext: "TableContext",
771 word: str,
772 lang: str,
773 pos: str,
774 text: str,
775 base_tags: Union[list[str], set[str], tuple[str, ...]],
776 silent=False,
777 ignore_tags=False,
778 depth=0,
779) -> list[tuple[str, ...]]:
780 """Expands a cell header to tagset, handling conditional expressions
781 in infl_map. This returns list of tuples of tags, each list element
782 describing an alternative interpretation. ``base_tags`` is combined
783 column and row tags for the cell in which the text is being interpreted
784 (conditional expressions in inflection data may depend on it).
785 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags``
786 is True, then tags listed in "if" will be ignored in the test (this is
787 used when trying to heuristically detect whether a non-<th> cell is anyway
788 a header)."""
789 assert isinstance(wxr, WiktextractContext)
790 assert isinstance(word, str)
791 assert isinstance(lang, str)
792 assert isinstance(pos, str)
793 assert isinstance(text, str)
794 assert isinstance(base_tags, (list, tuple, set))
795 assert silent in (True, False)
796 assert isinstance(depth, int)
797 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags))
798 # First map the text using the inflection map
799 text = clean_value(wxr, text)
800 combined_return: list[tuple[str, ...]] = []
801 parts = split_at_comma_semi(text, separators=[";"])
802 for text in parts:
803 if not text: 803 ↛ 804line 803 didn't jump to line 804 because the condition on line 803 was never true
804 continue
805 if text in infl_map:
806 v = infl_map[text] # list or string
807 else:
808 m = re.match(infl_start_re, text)
809 if m is not None: 809 ↛ 810line 809 didn't jump to line 810 because the condition on line 809 was never true
810 v = infl_start_map[m.group(1)]
811 # print("INFL_START {} -> {}".format(text, v))
812 elif re.match(r"Notes", text):
813 # Ignored header
814 # print("IGNORING NOTES")
815 combined_return = or_tagsets(
816 lang, pos, combined_return, [("dummy-skip-this",)]
817 )
818 # this just adds dummy-skip-this
819 continue
820 elif text in IGNORED_COLVALUES:
821 combined_return = or_tagsets(
822 lang, pos, combined_return, [("dummy-ignore-skipped",)]
823 )
824 continue
825 # Try without final parenthesized part
826 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text)
827 if text_without_parens in infl_map:
828 v = infl_map[text_without_parens]
829 elif m is None: 829 ↛ 845line 829 didn't jump to line 845 because the condition on line 829 was always true
830 if not silent:
831 wxr.wtp.debug(
832 "inflection table: unrecognized header: {}".format(
833 repr(text)
834 ),
835 sortid="inflection/735",
836 )
837 # Unrecognized header
838 combined_return = or_tagsets(
839 lang, pos, combined_return, [("error-unrecognized-form",)]
840 )
841 continue
843 # Then loop interpreting the value, until the value is a simple string.
844 # This may evaluate nested conditional expressions.
845 default_then = None
846 while True:
847 # If it is a string, we are done.
848 if isinstance(v, str):
849 tags = set(v.split())
850 remove_useless_tags(lang, pos, tags)
851 tagset = [tuple(sorted(tags))]
852 break
853 # For a list, just interpret it as alternatives. (Currently the
854 # alternatives must directly be strings.)
855 if isinstance(v, (list, tuple)):
856 tagset = []
857 for x in v:
858 tags = set(x.split())
859 remove_useless_tags(lang, pos, tags)
860 tags_t = tuple(sorted(tags))
861 if tags_t not in tagset: 861 ↛ 857line 861 didn't jump to line 857 because the condition on line 861 was always true
862 tagset.append(tags_t)
863 break
864 # Otherwise the value should be a dictionary describing a
865 # conditional expression.
866 if not isinstance(v, dict): 866 ↛ 867line 866 didn't jump to line 867 because the condition on line 866 was never true
867 wxr.wtp.debug(
868 "inflection table: internal: "
869 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]),
870 sortid="inflection/767",
871 )
872 tagset = [()]
873 break
874 # Evaluate the conditional expression.
875 assert isinstance(v, dict)
876 cond: Union[bool, str] = "default-true"
877 c: Union[str, list[str], set[str]] = ""
878 # Handle "lang" condition. The value must be either a
879 # single language or a list of languages, and the
880 # condition evaluates to True if the table is one of
881 # those languages.
882 if "lang" in v:
883 c = v["lang"]
884 if isinstance(c, str):
885 cond = c == lang
886 else:
887 assert isinstance(c, (list, tuple, set))
888 cond = lang in c
889 # Handle "nested-table-depth" condition. The value must
890 # be an int or list of ints, and the condition evaluates
891 # True if the depth is one of those values.
892 # "depth" is how deep into a nested table tree the current
893 # table lies. It is first started in handle_wikitext_table,
894 # so only applies to tables-within-tables, not other
895 # WikiNode content. `depth` is currently only passed as a
896 # parameter down the table parsing stack, and not stored.
897 if cond and "nested-table-depth" in v: 897 ↛ 898line 897 didn't jump to line 898 because the condition on line 897 was never true
898 d = v["nested-table-depth"]
899 if isinstance(d, int):
900 cond = d == depth
901 else:
902 assert isinstance(d, (list, tuple, set))
903 cond = depth in d
904 # Handle inflection-template condition. Must be a string
905 # or list of strings, and if tablecontext.template_name is in
906 # those, accept the condition.
907 # TableContext.template_name is passed down from page/
908 # parse_inflection, before parsing and expanding itself
909 # has begun.
910 if cond and tablecontext and "inflection-template" in v:
911 d1 = v["inflection-template"]
912 if isinstance(d1, str): 912 ↛ 915line 912 didn't jump to line 915 because the condition on line 912 was always true
913 cond = d1 == tablecontext.template_name
914 else:
915 assert isinstance(d1, (list, tuple, set))
916 cond = tablecontext.template_name in d1
917 # Handle "pos" condition. The value must be either a single
918 # part-of-speech or a list of them, and the condition evaluates to
919 # True if the part-of-speech is any of those listed.
920 if cond and "pos" in v:
921 c = v["pos"]
922 if isinstance(c, str):
923 cond = c == pos
924 else:
925 assert isinstance(c, (list, tuple, set))
926 cond = pos in c
927 # Handle "if" condition. The value must be a string containing a
928 # space-separated list of tags. The condition evaluates to True if
929 # ``base_tags`` contains all of the listed tags. If the condition
930 # is of the form "any: ...tags...", then any of the tags will be
931 # enough.
932 if cond and "if" in v and not ignore_tags:
933 c = v["if"]
934 assert isinstance(c, str)
935 # "if" condition is true if any of the listed tags is present if
936 # it starts with "any:", otherwise all must be present
937 if c.startswith("any: "):
938 cond = any(t in base_tags for t in c[5:].split())
939 else:
940 cond = all(t in base_tags for t in c.split())
942 # Handle "default" assignment. Store the value to be used
943 # as a default later.
944 if "default" in v:
945 assert isinstance(v["default"], str)
946 default_then = v["default"]
948 # Warning message about missing conditions for debugging.
950 if cond == "default-true" and not default_then and not silent:
951 wxr.wtp.debug(
952 "inflection table: IF MISSING COND: word={} "
953 "lang={} text={} base_tags={} c={} cond={}".format(
954 word, lang, text, base_tags, c, cond
955 ),
956 sortid="inflection/851",
957 )
958 # Based on the result of evaluating the condition, select either
959 # "then" part or "else" part.
960 if cond:
961 v = v.get("then", "")
962 else:
963 v1 = v.get("else")
964 if v1 is None:
965 if default_then:
966 v = default_then
967 else:
968 if not silent:
969 wxr.wtp.debug(
970 "inflection table: IF WITHOUT ELSE EVALS "
971 "False: "
972 "{}/{} {!r} base_tags={}".format(
973 word, lang, text, base_tags
974 ),
975 sortid="inflection/865",
976 )
977 v = "error-unrecognized-form"
978 else:
979 v = v1
981 # Merge the resulting tagset from this header part with the other
982 # tagsets from the whole header
983 combined_return = or_tagsets(lang, pos, combined_return, tagset)
985 # Return the combined tagsets, or empty tagset if we got no tagsets
986 if not combined_return:
987 combined_return = [()]
988 return combined_return
def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list["HdrSpan"],
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Iterates over ``hdrspans`` bottom-up (headers closest to the data cell
    first), keeping only spans that horizontally overlap the cell at
    ``start``..``start + colspan``, and merges their tagsets row by row.
    Language-specific configuration (``get_lang_conf``) controls when a
    higher header is merged, skipped, or terminates the scan.

    Returns a list of alternative tag tuples (at least ``[()]``).
    ``celltext`` is only used to trigger debug printing when it equals
    the module-level ``debug_cell_text``.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    #       .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                " row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    used = set()  # (start, colspan) keys already consumed
    coltags = [()]
    last_header_row = 1000000
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000
    used_hdrspans = set()  # id()s of hdrspans already merged
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets. This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row. Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside
                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        # A (start, colspan) position that was already consumed lower in the
        # table; language config decides whether to stop, skip, or reuse it.
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above. However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 1000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            # NOTE: if none of the "stop" configs above fired, control falls
            # through to this second chain, which decides skip/merge/stop.
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags
1373def parse_simple_table(
1374 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
1375):
1376 """This is the default table parser. Despite its name, it can parse
1377 complex tables. This returns a list of forms to be added to the
1378 part-of-speech, or None if the table could not be parsed."""
1379 assert isinstance(wxr, WiktextractContext)
1380 assert isinstance(tablecontext, TableContext)
1381 assert isinstance(word, str)
1382 assert isinstance(lang, str)
1383 assert isinstance(pos, str)
1384 assert isinstance(rows, list)
1385 assert isinstance(source, str)
1386 assert isinstance(after, str)
1387 assert isinstance(depth, int)
1388 for row in rows:
1389 for col in row:
1390 assert isinstance(col, InflCell)
1391 assert isinstance(titles, list)
1392 for x in titles:
1393 assert isinstance(x, str)
1395 # print("PARSE_SIMPLE_TABLE: TITLES:", titles)
1396 if debug_cell_text: 1396 ↛ 1397line 1396 didn't jump to line 1397 because the condition on line 1396 was never true
1397 print("ROWS:")
1398 for row in rows:
1399 print(" ", row)
1401 # Check for forced rowspan kludge. See e.g.
1402 # maorski/Serbo-Croatian. These are essentially multi-row
1403 # cells implemented using <br> rather than separate cell. We fix this
1404 # by identifying rows where this happens, and splitting the current row
1405 # to multiple rows by synthesizing additional cells.
1406 new_rows = []
1407 for row in rows:
1408 split_row = (
1409 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row)
1410 and
1411 # x is an InflCell
1412 all(x.rowspan == 1 for x in row)
1413 )
1414 if not split_row:
1415 new_rows.append(row)
1416 continue
1417 row1 = []
1418 row2 = []
1419 for cell in row:
1420 cell1 = copy.deepcopy(cell)
1421 if "\n" in cell.text:
1422 # Has more than one line - split this cell
1423 parts = cell.text.strip().splitlines()
1424 if len(parts) != 2: 1424 ↛ 1425line 1424 didn't jump to line 1425 because the condition on line 1424 was never true
1425 wxr.wtp.debug(
1426 "forced rowspan kludge got {} parts: {!r}".format(
1427 len(parts), cell.text
1428 ),
1429 sortid="inflection/1234",
1430 )
1431 cell2 = copy.deepcopy(cell)
1432 cell1.text = parts[0]
1433 cell2.text = parts[1]
1434 else:
1435 cell1.rowspan = 2
1436 cell2 = cell1 # ref, not a copy
1437 row1.append(cell1)
1438 row2.append(cell2)
1439 new_rows.append(row1)
1440 new_rows.append(row2)
1441 rows = new_rows
1442 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:")
1443 # for row in rows:
1444 # print(" ", row)
1446 # Parse definitions for references (from table itself and from text
1447 # after it)
1448 def_ht = {}
1450 def add_defs(defs: list[tuple[str, str]]) -> None:
1451 for ref, d in defs:
1452 # print("DEF: ref={} d={}".format(ref, d))
1453 d = d.strip()
1454 d = d.split(". ")[0].strip() # text before ". "
1455 if not d: 1455 ↛ 1456line 1455 didn't jump to line 1456 because the condition on line 1455 was never true
1456 continue
1457 if d.endswith("."): # catc ".."??
1458 d = d[:-1]
1459 tags, topics = decode_tags(d, no_unknown_starts=True)
1460 # print(f"{ref=}, {d=}, {tags=}")
1461 if topics or any("error-unknown-tag" in ts for ts in tags):
1462 d = d[0].lower() + d[1:]
1463 tags, topics = decode_tags(d, no_unknown_starts=True)
1464 if topics or any("error-unknown-tag" in ts for ts in tags):
1465 # Failed to parse as tags
1466 # print("Failed: topics={} tags={}"
1467 # .format(topics, tags))
1468 continue
1469 tags1_s: set[str] = set()
1470 for ts in tags:
1471 tags1_s.update(ts)
1472 tags1 = tuple(sorted(tags1_s))
1473 # print("DEFINED: {} -> {}".format(ref, tags1))
1474 def_ht[ref] = tags1
    def generate_tags(
        rowtags: list[tuple[str, ...]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        """Expand the current header cell into tagsets.

        Combines each existing row tagset with the column tags computed
        from current header spans, expands the header ``text`` against
        the inflection map, and returns
        ``(new_rowtags, new_coltags, all_hdr_tags)``.

        Reads ``hdrspans``, ``col_idx``, ``colspan``, ``col``, ``text``,
        ``global_tags``, ``refs_tags`` and ``hdr_tags`` from the enclosing
        scope; mutates the caller's ``table_tags`` list in place when
        forced word-level tags are encountered.
        """
        new_coltags = []
        all_hdr_tags = []  # list of tuples
        new_rowtags = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                base_tags: set[str] = (
                    set(rt0)
                    | set(ct0)
                    | set(global_tags)
                    | set(itertools.chain.from_iterable(table_tags))
                )  # Union.
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Certain tags are always moved to word-level tags
                    if tt_s & TAGS_FORCED_WORDTAGS:
                        table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS)
                        tt_s = tt_s - TAGS_FORCED_WORDTAGS
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags
    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        """Create a HdrSpan for the current header cell and update the
        left-side-header (col0) expansion state.

        Appends the new span to ``hdrspans`` (mutated in place) and
        returns the possibly-updated ``(col, col0_followed_by_nonempty,
        col0_hdrspan)`` triple.  Reads ``col_idx``, ``colspan``,
        ``rowspan``, ``rownum``, ``new_coltags``, ``all_headers``,
        ``all_hdr_tags`` and ``previously_seen`` from the enclosing scope.
        """
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags). In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            # No left-side header candidate yet; this one becomes it.
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # print("col0_cats={} later_cats={} "
            #       "fol_by_nonempty={} col_idx={} end={} "
            #       "tagsets={}"
            #       .format(col0_cats, later_cats,
            #               col0_followed_by_nonempty, col_idx,
            #               col0_hdrspan.start +
            #               col0_hdrspan.colspan,
            #               col0_hdrspan.tagsets))
            # print("col0.rowspan={} rowspan={}"
            #       .format(col0_hdrspan.rowspan, rowspan))
            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            #   - col0_hdrspan set, continue with allowed current
            #   - col0_hdrspan set, expand, start new
            #   - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan. Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan
                # *Expand* current col0_hdrspan
                # print("EXPANDING COL0 MID: {} from {} to {} "
                #       "cols {}"
                #       .format(col0_hdrspan.text,
                #               col0_hdrspan.colspan,
                #               col_idx - col0_hdrspan.start,
                #               col0_hdrspan.tagsets))
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan
    def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
        """Split cell text ``col`` into alternative forms.

        Returns ``(col, alts, split_extra_tags)`` where ``col`` may have
        been rewritten (magic-character substitution), ``alts`` is the
        list of alternative form strings, and ``split_extra_tags`` lists
        extra tags from language-specific special phrase splits.
        """
        # Split the cell text into alternatives
        split_extra_tags = []
        if col and is_superscript(col[0]):
            # A cell starting with a superscript (footnote marker) is
            # taken as a single alternative, unsplit.
            alts = [col]
        else:
            separators = [";", "•", r"\n", " or "]
            if " + " not in col:
                separators.append(",")
            if not col.endswith("/"):
                separators.append("/")
            if col in special_phrase_splits:
                # Use language-specific special splits.
                # These are phrases and constructions that have
                # unique ways of splitting, not specific characters
                # to split on like with the default splitting.
                alts, tags = special_phrase_splits[col]
                split_extra_tags = tags.split()
                for x in split_extra_tags:
                    assert x in valid_tags
                assert isinstance(alts, (list, tuple))
                assert isinstance(tags, str)
            else:
                # Use default splitting. However, recognize
                # language-specific replacements and change them to magic
                # characters before splitting. This way we won't split
                # them. This is important for, e.g., recognizing
                # alternative pronouns.
                # The magic characters are characters out of Unicode scope
                # that are given a simple incremental value, int > unicode.
                repls = {}
                magic_ch = MAGIC_FIRST
                trs = get_lang_conf(lang, "form_transformations")
                # trs is a list of lists of strings
                for _, v, _, _ in trs:
                    # v is a pattern string, like "^ich"
                    # form_transformations data is doing double-duty here,
                    # because the pattern strings are already known to us and
                    # not meant to be split.
                    m = re.search(v, col)
                    if m is not None:
                        # if pattern found in text
                        magic = chr(magic_ch)
                        magic_ch += 1  # next magic character value
                        col = re.sub(v, magic, col)  # replace with magic ch
                        repls[magic] = m.group(0)
                        # remember what regex match string each magic char
                        # replaces. .group(0) is the whole match.
                alts0 = split_at_comma_semi(col, separators=separators)
                # with magic characters in place, split the text so that
                # pre-transformation text is out of the way.
                alts = []
                for alt in alts0:
                    # create a new list with the separated items and
                    # the magic characters replaced with the original texts.
                    # NOTE(review): re.sub treats the replacement v as a
                    # template — a backslash in the matched text would be
                    # misinterpreted here; fine for plain-word pronouns.
                    for k, v in repls.items():
                        alt = re.sub(k, v, alt)
                    alts.append(alt)
        # Remove "*" from beginning of forms, as in non-attested
        # or reconstructed forms. Otherwise it might confuse romanization
        # detection.
        alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts)
        alts = list(
            x for x in alts if not re.match(r"pronounced with |\(with ", x)
        )
        alts = list(
            re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts
        )
        # Check for parenthesized alternatives, e.g. ripromettersi/Italian
        if all(
            re.match(r"\w+( \w+)* \(\w+( \w+)*(, \w+( \w+)*)*\)$", alt)
            # word word* \(word word*(, word word*)*\)
            and all(
                distw([re.sub(r" \(.*", "", alt)], x) < 0.5
                # Levenshtein distance
                for x in re.sub(r".*\((.*)\)", r"\1", alt).split(", ")
            )
            # Extract from parentheses for testing
            for alt in alts
        ):
            new_alts = []
            for alt in alts:
                # Replace parentheses before splitting
                alt = alt.replace(" (", ", ")
                alt = alt.replace(")", "")
                for new_alt in alt.split(", "):
                    new_alts.append(new_alt)
            alts = new_alts
        return col, alts, split_extra_tags
def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
    """Disentangle cells that mix forms with romanizations or IPA.

    Takes the list of alternative strings extracted from one table cell
    and returns a list of (form, romanization, ipa) triples.  Several
    layouts are recognized by position and by what classify_desc()
    says about each half/element; anything unrecognized falls through
    to parenthesis expansion (e.g. "kind(er)" -> "kind", "kinder").
    """
    # Handle the special case where romanization is given under
    # normal form, e.g. in Russian. There can be multiple
    # comma-separated forms in each case. We also handle the case
    # where instead of romanization we have IPA pronunciation
    # (e.g., avoir/French/verb).
    len2 = len(alts) // 2
    # Check for IPAs (forms first, IPAs under)
    # layout: base, base, IPA, IPA
    if (
        len(alts) % 2 == 0  # divisible by two
        and all(
            re.match(r"^\s*/.*/\s*$", x)  # inside slashes = IPA
            for x in alts[len2:]
        )
    ):  # the whole second half of alts is IPA
        nalts = list(
            (alts[i], "", alts[i + len2])
            # list of tuples: (base, "", ipa)
            for i in range(len2)
        )
    # layout: base, base, base, IPA
    elif (
        len(alts) > 2
        and re.match(r"^\s*/.*/\s*$", alts[-1])
        and all(not x.startswith("/") for x in alts[:-1])
    ):
        # Only if the last alt is IPA: the single IPA applies to
        # every preceding base form.
        nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
    # layout: base, IPA, IPA, IPA
    elif (
        len(alts) > 2
        and not alts[0].startswith("/")
        and all(
            re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
        )
    ):
        # First is base and the rest are IPA alternatives
        nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

    # Check for romanizations: forms first, romanizations under
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    # Remove ends of strings starting from ^.
                    # Superscript characters have already been
                    # removed from the string, while a plain ^xyz
                    # marker needs to be removed separately; it is
                    # usually something with a single letter.
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            == "other"
            for x in alts[:len2]
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for x in alts[len2:]
        )
    ):
        nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
    # Check for romanizations: forms and romanizations alternating
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            == "other"
            for i in range(0, len(alts), 2)
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for i in range(1, len(alts), 2)
        )
    ):
        # even indices are forms, odd indices are romanizations
        nalts = list(
            (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
        )
    else:
        # Fallback: expand parenthesized alternatives inside each alt,
        # e.g. "kind(er)" -> ["kind", "kinder"], and
        # "lampai(tten/den)" -> ["lampaitten", "lampaiden"].
        new_alts = []
        for alt in alts:
            lst = [""]
            idx = 0
            for m in re.finditer(
                r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                # start OR letter OR asterisk (word/word*)
                # \\___________group 1_______/ \ \_g3_///
                #                               \ \__gr. 2_//
                #  \_____________group 0________________/
                alt,
            ):
                v = m.group(2)  # word/word/word...
                if (
                    classify_desc(v) == "tags"  # tags inside parens
                    or m.group(0) == alt
                ):  # or the whole alt is inside parens
                    continue
                new_lst = []
                for x in lst:
                    x += alt[idx : m.start()] + m.group(1)
                    # alt up to and including the letter or asterisk
                    idx = m.end()
                    vparts = v.split("/")
                    # group(2) split into ["word", "wörd", ...]
                    if len(vparts) == 1:
                        new_lst.append(x)
                        new_lst.append(x + v)
                        # "kind(er)" -> ["kind", "kinder"]
                    else:
                        for vv in vparts:
                            new_lst.append(x + vv)
                        # "lampai(tten/den)" ->
                        # ["lampaitten", "lampaiden"]
                lst = new_lst
            for x in lst:
                new_alts.append(x + alt[idx:])
                # append the tail of alt after the last paren group
        nalts = list((x, "", "") for x in new_alts)
        # [form, no romanization, no ipa]
    return nalts
def find_semantic_parens(form: str) -> tuple[str, list[str]]:
    """Strip semantically meaningful brackets and map them to tags.

    Some languages (notably Greek) wrap whole forms in brackets to
    signal qualities: (informal), [rare], {archaic}, {[rare+archaic]}.
    The brackets are always removed; the corresponding tags are added
    only when the language's configuration enables that convention.
    Returns the possibly unwrapped form and the list of extra tags.
    """
    extra_tags: list[str] = []
    if re.match(r"\([^][(){}]*\)$", form):
        # Fully parenthesized form, e.g. "(foo)"
        form = form[1:-1]
        if get_lang_conf(lang, "parentheses_for_informal"):
            extra_tags.append("informal")
    elif re.match(r"\{\[[^][(){}]*\]\}$", form):
        # Doubly wrapped "{[foo]}", e.g. είμαι/Greek/Verb
        form = form[2:-2]
        if get_lang_conf(
            lang, "square_brackets_for_rare"
        ) and get_lang_conf(lang, "curly_brackets_for_archaic"):
            extra_tags.extend(["rare", "archaic"])
    elif re.match(r"\{[^][(){}]*\}$", form):
        # Curly braces "{foo}", e.g. είμαι/Greek/Verb
        form = form[1:-1]
        if get_lang_conf(lang, "curly_brackets_for_archaic"):
            extra_tags.append("archaic")
    elif re.match(r"\[[^][(){}]*\]$", form):
        # Square brackets "[foo]", e.g. είμαι/Greek/Verb
        form = form[1:-1]
        if get_lang_conf(lang, "square_brackets_for_rare"):
            extra_tags.append("rare")
    return form, extra_tags
def handle_parens(
    form: str, roman: str, clitic: str, extra_tags: list[str]
) -> tuple[str, str, str]:
    """Interpret one parenthesized chunk found inside a form.

    NOTE: this closure reads free variables set by the caller just
    before the call: ``paren`` (the text inside the parentheses),
    ``m`` (the regex match locating it in ``form``) and ``subst``
    (the whitespace to splice in when the parens are removed).
    ``extra_tags`` is mutated in place; the possibly modified
    (form, roman, clitic) triple is returned.
    """
    if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
        # Is there a clitic starting with an apostrophe?
        clitic = paren
        # assume the whole paren is a clitic,
        # then remove the paren from form
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif classify_desc(paren) == "tags":
        tagsets1, topics1 = decode_tags(paren)
        if not topics1:
            for ts in tagsets1:
                ts = tuple(x for x in ts if " " not in x)
                # There are some generated tags containing
                # spaces; do not let them through here.
                extra_tags.extend(ts)
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
    # brackets contain romanization
    elif (
        m.start() > 0
        and not roman
        and classify_desc(form[: m.start()]) == "other"
        and
        # "other" ~ plain text (not tags/romanization/english)
        classify_desc(paren) in ("romanization", "english")
        and not re.search(r"^with |-form$", paren)
    ):
        roman = paren
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif re.search(r"^with |-form", paren):
        # Drop usage notes like "(with X)" / "(...-form)" from the form
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    return form, roman, clitic
def merge_row_and_column_tags(form, some_has_covered_text):
    """Combine row-header and column-header tags for one form.

    For every (rowtags x coltags) combination, build the final tag
    set (also folding in global/table/footnote/section tags), apply
    assorted language- and POS-specific cleanups, and emit form
    entries.  Returns (list of form dicts, possibly modified form,
    updated some_has_covered_text flag).

    NOTE: this closure reads many free variables from the enclosing
    scope: rowtags, coltags, global_tags, extra_tags, refs_tags,
    tablecontext, lang, pos, wxr, col_idx, has_covering_hdr, roman,
    ipa, clitic and source.
    """
    # Merge column tags and row tags. We give preference
    # to moods etc coming from rowtags (cf. austteigen/German/Verb
    # imperative forms).

    # In certain cases, what a tag means depends on whether
    # it is a row or column header. Depending on the language,
    # we replace certain tags with others if they're in
    # a column or row.

    ret = []
    # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
    # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
    for rt in sorted(rowtags):
        if "dummy-use-as-coltags" in rt:
            continue
        # if lang was in rowtag_replacements)
        # if not rtagreplacs == None:
        #     rt = replace_directional_tags(rt, rtagreplacs)
        for ct in sorted(coltags):
            if "dummy-use-as-rowtags" in ct:
                continue
            # if lang was in coltag_replacements
            # if not ctagreplacs == None:
            #     ct = replace_directional_tags(ct, ctagreplacs)
            tags = set(global_tags)
            tags.update(extra_tags)
            tags.update(rt)
            tags.update(refs_tags)
            tags.update(tablecontext.section_header)
            # Merge tags from column. For certain kinds of tags,
            # those coming from row take precedence: skip a column
            # tag whose category (mood/case/number) is already
            # represented by a row-derived tag.
            old_tags = set(tags)
            for t in ct:
                c = valid_tags[t]
                if c in ("mood", "case", "number") and any(
                    valid_tags[tt] == c for tt in old_tags
                ):
                    continue
                tags.add(t)

            # Extract language-specific tags from the
            # form. This may also adjust the form.
            form, lang_tags = lang_specific_tags(lang, pos, form)
            tags.update(lang_tags)

            # For non-finite verb forms, see if they have
            # a gender/class suffix
            if pos == "verb" and any(
                valid_tags[t] == "non-finite" for t in tags
            ):
                form, tt = parse_head_final_tags(wxr, lang, form)
                tags.update(tt)

            # Remove "personal" tag if we have an nth-person tag;
            # these come up with e.g. reconhecer/Portuguese/Verb.
            # But not if we also have "pronoun".
            if (
                "personal" in tags
                and "pronoun" not in tags
                and any(
                    x in tags
                    for x in [
                        "first-person",
                        "second-person",
                        "third-person",
                    ]
                )
            ):
                tags.remove("personal")

            # If we have impersonal, remove person and number.
            # This happens with e.g. viajar/Portuguese/Verb
            if "impersonal" in tags:
                tags = tags - set(
                    [
                        "first-person",
                        "second-person",
                        "third-person",
                        "singular",
                        "plural",
                    ]
                )

            # Remove unnecessary "positive" tag from verb forms
            if pos == "verb" and "positive" in tags:
                if "negative" in tags:
                    tags.remove("negative")
                tags.remove("positive")

            # Many Russian (and other Slavic) inflection tables
            # have animate/inanimate distinction that generates
            # separate entries for neuter/feminine, but the
            # distinction only applies to masculine. Remove them
            # from neuter/feminine and eliminate duplicates.
            if get_lang_conf(lang, "masc_only_animate"):
                for t1 in ("animate", "inanimate"):
                    for t2 in ("neuter", "feminine"):
                        if (
                            t1 in tags
                            and t2 in tags
                            and "masculine" not in tags
                            and "plural" not in tags
                        ):
                            tags.remove(t1)

            # German adjective tables contain "(keiner)" etc
            # for mixed declension plural. When the adjective
            # disappears and it becomes just one word, remove
            # the "includes-article" tag. e.g. eiskalt/German
            if "includes-article" in tags and " " not in form:
                tags.remove("includes-article")

            # Handle ignored forms. We mark that the form was
            # provided. This is important information; some words
            # just do not have a certain form. However, there are also
            # many cases where no word in a language has a
            # particular form. Post-processing could detect and
            # remove such cases.
            if form in IGNORED_COLVALUES:
                # cell text seems to be ignorable
                if "dummy-ignore-skipped" in tags:
                    continue
                if (
                    col_idx not in has_covering_hdr
                    and some_has_covered_text
                ):
                    continue
                # don't ignore this cell if there's been a header
                # above it; record it as an explicit "-" form
                form = "-"
            elif col_idx in has_covering_hdr:
                some_has_covered_text = True

            # Handle ambiguous object concord. If a header
            # gives the "dummy-object-concord"-tag to a word,
            # replace person, number and gender tags with
            # their "object-" counterparts so that the verb
            # agrees with the object instead.
            # Use only when the verb has ONLY object agreement!
            # a پخول/Pashto
            if "dummy-object-concord" in tags:
                for subtag, objtag in object_concord_replacements.items():
                    if subtag in tags:
                        tags.remove(subtag)
                        tags.add(objtag)

            # Remove the dummy mood tag that we sometimes
            # use to block adding other mood and related
            # tags
            tags = tags - set(
                [
                    "dummy-mood",
                    "dummy-tense",
                    "dummy-ignore-skipped",
                    "dummy-object-concord",
                    "dummy-reset-headers",
                    "dummy-use-as-coltags",
                    "dummy-use-as-rowtags",
                    "dummy-store-hdrspan",
                    "dummy-load-stored-hdrspans",
                    "dummy-reset-stored-hdrspans",
                    "dummy-section-header",
                ]
            )

            # Perform language-specific tag replacements according
            # to rules in a table.
            lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
            if lang_tag_mappings is not None:
                for pre, post in lang_tag_mappings.items():
                    if all(t in tags for t in pre):
                        tags = (tags - set(pre)) | set(post)

            # Warn if there are entries with empty tags
            if not tags:
                wxr.wtp.debug(
                    "inflection table: empty tags for {}".format(form),
                    sortid="inflection/1826",
                )

            # Warn if form looks like IPA
            ########## XXX ########
            # Because IPA is its own unicode block, we could also
            # technically do a Unicode name check to see if a string
            # contains IPA. Not all valid IPA characters are in the
            # IPA extension block, so you can technically have false
            # negatives if it's something like /toki/, but it
            # shouldn't give false positives.
            # Alternatively, you could make a list of IPA-admissible
            # characters and reject non-IPA stuff with that.
            if re.match(r"\s*/.*/\s*$", form):
                wxr.wtp.debug(
                    "inflection table form looks like IPA: "
                    "form={} tags={}".format(form, tags),
                    sortid="inflection/1840",
                )

            # Note that this checks `form`, not `in tags`
            if form == "dummy-ignored-text-cell":
                continue

            if "dummy-remove-this-cell" in tags:
                continue

            # Add the form
            tags = list(sorted(tags))
            dt = {"form": form, "tags": tags, "source": source}
            if roman:
                dt["roman"] = roman
            if ipa:
                dt["ipa"] = ipa
            ret.append(dt)
            # If we got a separate clitic form, add it too
            if clitic:
                dt = {
                    "form": clitic,
                    "tags": tags + ["clitic"],
                    "source": source,
                }
                ret.append(dt)
    return ret, form, some_has_covered_text
2166 # First extract definitions from cells
2167 # See defs_ht for footnote defs stuff
2168 for row in rows:
2169 for cell in row:
2170 text, refs, defs, hdr_tags = extract_cell_content(
2171 lang, word, cell.text
2172 )
2173 # refs, defs = footnote stuff, defs -> (ref, def)
2174 add_defs(defs)
2175 # Extract definitions from text after table
2176 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after)
2177 add_defs(defs)
2179 # Then extract the actual forms
2180 ret = []
2181 hdrspans = []
2182 first_col_has_text = False
2183 rownum = 0
2184 title = None
2185 global_tags = []
2186 table_tags = []
2187 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits")
2188 form_replacements = get_lang_conf(lang, "form_replacements")
2189 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells")
2190 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups")
2192 for title in titles:
2193 more_global_tags, more_table_tags, extra_forms = parse_title(
2194 title, source
2195 )
2196 global_tags.extend(more_global_tags)
2197 table_tags.extend(more_table_tags)
2198 ret.extend(extra_forms)
2199 cell_rowcnt = collections.defaultdict(int)
2200 seen_cells = set()
2201 has_covering_hdr = set()
2202 some_has_covered_text = False
2203 for row in rows:
2204 # print("ROW:", row)
2205 # print("====")
2206 # print(f"Start of PREVIOUS row hdrspans:"
2207 # f"{tuple(sp.tagsets for sp in hdrspans)}")
2208 # print(f"Start of row txt: {tuple(t.text for t in row)}")
2209 if not row: 2209 ↛ 2210line 2209 didn't jump to line 2210 because the condition on line 2209 was never true
2210 continue # Skip empty rows
2211 all_headers = all(x.is_title or not x.text.strip() for x in row)
2212 text = row[0].text
2213 if (
2214 row[0].is_title
2215 and text
2216 and not is_superscript(text[0])
2217 and text not in infl_map # zealous inflation map?
2218 and (
2219 re.match(r"Inflection ", text)
2220 or re.sub(
2221 r"\s+",
2222 " ", # flatten whitespace
2223 re.sub(
2224 r"\s*\([^)]*\)",
2225 "",
2226 # Remove whitespace+parens
2227 text,
2228 ),
2229 ).strip()
2230 not in infl_map
2231 )
2232 and not re.match(infl_start_re, text)
2233 and all(
2234 x.is_title == row[0].is_title and x.text == text
2235 # all InflCells in `row` have the same is_title and text
2236 for x in row
2237 )
2238 ):
2239 if text and title is None:
2240 # Only if there were no titles previously make the first
2241 # text that is found the title
2242 title = text
2243 if re.match(r"(Note:|Notes:)", title): 2243 ↛ 2244line 2243 didn't jump to line 2244 because the condition on line 2243 was never true
2244 continue # not a title
2245 more_global_tags, more_table_tags, extra_forms = parse_title(
2246 title, source
2247 )
2248 global_tags.extend(more_global_tags)
2249 table_tags.extend(more_table_tags)
2250 ret.extend(extra_forms)
2251 continue # Skip title rows without incrementing i
2252 if "dummy-skip-this" in global_tags: 2252 ↛ 2253line 2252 didn't jump to line 2253 because the condition on line 2252 was never true
2253 return []
2254 rowtags = [()]
2255 # have_hdr = False
2256 # have_hdr never used?
2257 have_text = False
2258 samecell_cnt = 0
2259 col0_hdrspan = None # col0 or later header (despite its name)
2260 col0_followed_by_nonempty = False
2261 row_empty = True
2262 for col_idx, cell in enumerate(row):
2263 colspan = cell.colspan # >= 1
2264 rowspan = cell.rowspan # >= 1
2265 previously_seen = id(cell) in seen_cells
2266 # checks to see if this cell was in the previous ROW
2267 seen_cells.add(id(cell))
2268 if samecell_cnt == 0:
2269 # First column of a (possible multi-column) cell
2270 samecell_cnt = colspan - 1
2271 else:
2272 assert samecell_cnt > 0
2273 samecell_cnt -= 1
2274 continue
2276 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0
2277 # never used?
2279 # defaultdict(int) around line 1900
2280 cell_rowcnt[id(cell)] += 1
2281 # => how many cols this spans
2282 col = cell.text
2283 if not col:
2284 continue
2285 row_empty = False
2286 is_title = cell.is_title
2288 # If the cell has a target, i.e., text after colon, interpret
2289 # it as simply specifying a value for that value and ignore
2290 # it otherwise.
2291 if cell.target:
2292 text, refs, defs, hdr_tags = extract_cell_content(
2293 lang, word, col
2294 )
2295 if not text: 2295 ↛ 2296line 2295 didn't jump to line 2296 because the condition on line 2295 was never true
2296 continue
2297 refs_tags = set()
2298 for ref in refs: # gets tags from footnotes 2298 ↛ 2299line 2298 didn't jump to line 2299 because the loop on line 2298 never started
2299 if ref in def_ht:
2300 refs_tags.update(def_ht[ref])
2301 rowtags = expand_header(
2302 wxr,
2303 tablecontext,
2304 word,
2305 lang,
2306 pos,
2307 text,
2308 [],
2309 silent=True,
2310 depth=depth,
2311 )
2312 rowtags = list(
2313 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags)
2314 )
2315 is_title = False
2316 col = cell.target
2318 # print(rownum, col_idx, col)
2319 # print(f"is_title: {is_title}")
2320 if is_title:
2321 # It is a header cell
2322 text, refs, defs, hdr_tags = extract_cell_content(
2323 lang, word, col
2324 )
2325 if not text:
2326 continue
2327 # Extract tags from referenced footnotes
2328 refs_tags = set()
2329 for ref in refs:
2330 if ref in def_ht:
2331 refs_tags.update(def_ht[ref])
2333 # Expand header to tags
2334 v = expand_header(
2335 wxr,
2336 tablecontext,
2337 word,
2338 lang,
2339 pos,
2340 text,
2341 [],
2342 silent=True,
2343 depth=depth,
2344 )
2345 # print("EXPANDED {!r} to {}".format(text, v))
2347 if col_idx == 0:
2348 # first_col_has_text is used for a test to ignore
2349 # upper-left cells that are just text without
2350 # header info
2351 first_col_has_text = True
2352 # Check if the header expands to reset hdrspans
2353 if any("dummy-reset-headers" in tt for tt in v):
2354 new_hdrspans = []
2355 for hdrspan in hdrspans:
2356 # if there are HdrSpan objects (abstract headers with
2357 # row- and column-spans) that are to the left or at the
2358 # same row or below, KEEP those; things above and to
2359 # the right of the hdrspan with dummy-reset-headers
2360 # are discarded. Tags from the header together with
2361 # dummy-reset-headers are kept as normal.
2362 if (
2363 hdrspan.start + hdrspan.colspan < col_idx
2364 or hdrspan.rownum > rownum - cell.rowspan
2365 ):
2366 new_hdrspans.append(hdrspan)
2367 hdrspans = new_hdrspans
2369 for tt in v:
2370 if "dummy-section-header" in tt: 2370 ↛ 2371line 2370 didn't jump to line 2371 because the condition on line 2370 was never true
2371 tablecontext.section_header = tt
2372 break
2373 if "dummy-reset-section-header" in tt: 2373 ↛ 2374line 2373 didn't jump to line 2374 because the condition on line 2373 was never true
2374 tablecontext.section_header = []
2375 # Text between headers on a row causes earlier headers to
2376 # be reset
2377 if have_text:
2378 # print(" HAVE_TEXT BEFORE HDR:", col)
2379 # Reset rowtags if new title column after previous
2380 # text cells
2381 # +-----+-----+-----+-----+
2382 # |hdr-a|txt-a|hdr-B|txt-B|
2383 # +-----+-----+-----+-----+
2384 # ^reset rowtags=>
2385 # XXX beware of header "—": "" - must not clear on that if
2386 # it expands to no tags
2387 rowtags = [()]
2388 # have_hdr = True
2389 # have_hdr never used?
2390 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags))
2391 # Update rowtags and coltags
2392 has_covering_hdr.add(col_idx) # col_idx == current column
2393 # has_covering_hdr is a set that has the col_idx-ids of columns
2394 # that have previously had some kind of header. It is never
2395 # resetted inside the col_idx-loops OR the bigger rows-loop, so
2396 # applies to the whole table.
2398 rowtags, new_coltags, all_hdr_tags = generate_tags(
2399 rowtags, table_tags
2400 )
2402 if any("dummy-skip-this" in ts for ts in rowtags):
2403 continue # Skip this cell
2405 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2405 ↛ 2406line 2405 didn't jump to line 2406 because the condition on line 2405 was never true
2406 hdrspans.extend(tablecontext.stored_hdrspans)
2408 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2408 ↛ 2409line 2408 didn't jump to line 2409 because the condition on line 2408 was never true
2409 tablecontext.stored_hdrspans = []
2411 if any("dummy-store-hdrspan" in ts for ts in v): 2411 ↛ 2413line 2411 didn't jump to line 2413 because the condition on line 2411 was never true
2412 # print(f"STORED: {col}")
2413 store_new_hdrspan = True
2414 else:
2415 store_new_hdrspan = False
2417 new_coltags = list(
2418 x
2419 for x in new_coltags
2420 if not any(t in noinherit_tags for t in x)
2421 )
2422 # print("new_coltags={} previously_seen={} all_hdr_tags={}"
2423 # .format(new_coltags, previously_seen, all_hdr_tags))
2424 if any(new_coltags):
2425 (
2426 col,
2427 col0_followed_by_nonempty,
2428 col0_hdrspan,
2429 ) = add_new_hdrspan(
2430 col,
2431 hdrspans,
2432 store_new_hdrspan,
2433 col0_followed_by_nonempty,
2434 col0_hdrspan,
2435 )
2437 continue
2439 # These values are ignored, at least for now
2440 if re.match(r"^(# |\(see )", col): 2440 ↛ 2441line 2440 didn't jump to line 2441 because the condition on line 2440 was never true
2441 continue
2443 if any("dummy-skip-this" in ts for ts in rowtags):
2444 continue # Skip this cell
2446 # If the word has no rowtags and is a multi-row cell, then
2447 # ignore this. This happens with empty separator rows
2448 # within a rowspan>1 cell. cf. wander/English/Conjugation.
2449 if rowtags == [()] and rowspan > 1:
2450 continue
2452 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle.
2453 if cleanup_rules:
2454 for regx, substitution in cleanup_rules.items():
2455 col = re.sub(regx, substitution, col)
2457 if ( 2457 ↛ 2462line 2457 didn't jump to line 2462
2458 col_idx == 0
2459 and not first_col_has_text
2460 and get_lang_conf(lang, "ignore_top_left_text_cell") is True
2461 ):
2462 continue # Skip text at top left, as in Icelandic, Faroese
2464 # if col0_hdrspan is not None:
2465 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}"
2466 # .format(col0_hdrspan.text, col))
2467 col0_followed_by_nonempty = True
2468 have_text = True
2470 # Determine column tags for the multi-column cell
2471 combined_coltags = compute_coltags(
2472 lang, pos, hdrspans, col_idx, colspan, col
2473 )
2474 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2474 ↛ 2475line 2474 didn't jump to line 2475 because the condition on line 2474 was never true
2475 continue
2477 # print("HAVE_TEXT:", repr(col))
2478 # Split the text into separate forms. First simplify spaces except
2479 # newline.
2480 col = re.sub(r"[ \t\r]+", " ", col)
2481 # Split the cell text into alternatives
2483 col, alts, split_extra_tags = split_text_into_alts(col)
2485 # Some cells have mixed form content, like text and romanization,
2486 # or text and IPA. Handle these.
2487 alts = handle_mixed_lines(alts)
2489 alts = list((x, combined_coltags) for x in alts)
2491 # Generate forms from the alternatives
2492 # alts is a list of (tuple of forms, tuple of tags)
2493 for (form, base_roman, ipa), coltags in alts:
2494 form = form.strip()
2495 extra_tags = []
2496 extra_tags.extend(split_extra_tags)
2497 # Handle special splits again here, so that we can have custom
2498 # mappings from form to form and tags.
2499 if form in form_replacements:
2500 replacement, tags = form_replacements[form]
2501 for x in tags.split():
2502 assert x in valid_tags
2503 assert isinstance(replacement, str)
2504 assert isinstance(tags, str)
2505 form = replacement
2506 extra_tags.extend(tags.split())
2507 # Clean the value, extracting reference symbols
2508 form, refs, defs, hdr_tags = extract_cell_content(
2509 lang, word, form
2510 )
2511 # if refs:
2512 # print("REFS:", refs)
2513 extra_tags.extend(hdr_tags)
2514 # Extract tags from referenced footnotes
2515 # Extract tags from referenced footnotes
2516 refs_tags = set()
2517 for ref in refs:
2518 if ref in def_ht:
2519 refs_tags.update(def_ht[ref])
2521 if base_roman:
2522 base_roman, _, _, hdr_tags = extract_cell_content(
2523 lang, word, base_roman
2524 )
2525 extra_tags.extend(hdr_tags)
2527 # Do some additional cleanup on the cell.
2528 form = re.sub(r"^\s*,\s*", "", form)
2529 form = re.sub(r"\s*,\s*$", "", form)
2530 form = re.sub(r"\s*(,\s*)+", ", ", form)
2531 form = re.sub(r"(?i)^Main:", "", form)
2532 form = re.sub(r"\s+", " ", form)
2533 form = form.strip()
2535 # Look for parentheses that have semantic meaning
2536 form, et = find_semantic_parens(form)
2537 extra_tags.extend(et)
2539 # Handle parentheses in the table element. We parse
2540 # tags anywhere and romanizations anywhere but beginning.
2541 roman = base_roman
2542 paren = None
2543 clitic = None
2544 m = re.search(r"(\s+|^)\(([^)]*)\)", form)
2545 # start|spaces + (anything)
2546 if m is not None:
2547 subst = m.group(1)
2548 paren = m.group(2)
2549 else:
2550 m = re.search(r"\(([^)]*)\)(\s+|$)", form)
2551 # (anything) + spaces|end
2552 if m is not None: 2552 ↛ 2553line 2552 didn't jump to line 2553 because the condition on line 2552 was never true
2553 paren = m.group(1)
2554 subst = m.group(2)
2555 if paren is not None:
2556 form, roman, clitic = handle_parens(
2557 form, roman, clitic, extra_tags
2558 )
2560 # Ignore certain forms that are not really forms,
2561 # unless they're really, really close to the article title
2562 if form in ( 2562 ↛ 2567line 2562 didn't jump to line 2567 because the condition on line 2562 was never true
2563 "",
2564 "unchanged",
2565 "after an", # in sona/Irish/Adj/Mutation
2566 ):
2567 Lev = distw([form], word)
2568 if form and Lev < 0.1:
2569 wxr.wtp.debug(
2570 "accepted possible false positive '{}' with"
2571 "> 0.1 Levenshtein distance in {}/{}".format(
2572 form, word, lang
2573 ),
2574 sortid="inflection/2213",
2575 )
2576 elif form and Lev < 0.3:
2577 wxr.wtp.debug(
2578 "skipped possible match '{}' with > 0.3"
2579 "Levenshtein distance in {}/{}".format(
2580 form, word, lang
2581 ),
2582 sortid="inflection/2218",
2583 )
2584 continue
2585 else:
2586 continue
2587 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} "
2588 # "FORM={!r} ROMAN={!r}"
2589 # .format(rowtags, coltags, refs_tags,
2590 # form, roman))
2592 # Merge tags from row and column and do miscellaneous
2593 # tag-related handling.
2594 (
2595 merge_ret,
2596 form,
2597 some_has_covered_text,
2598 ) = merge_row_and_column_tags(form, some_has_covered_text)
2599 ret.extend(merge_ret)
2601 # End of row.
2602 rownum += 1
2603 # For certain languages, if the row was empty, reset
2604 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb).
2605 if row_empty and get_lang_conf(lang, "empty_row_resets"):
2606 hdrspans = []
2607 # Check if we should expand col0_hdrspan.
2608 if col0_hdrspan is not None:
2609 col0_allowed = get_lang_conf(lang, "hdr_expand_first")
2610 col0_cats = tagset_cats(col0_hdrspan.tagsets)
2611 # Only expand if col0_cats and later_cats are allowed
2612 # and don't overlap and col0 has tags, and there have
2613 # been no disallowed cells in between.
2614 if (
2615 not col0_followed_by_nonempty
2616 and not (col0_cats - col0_allowed)
2617 and
2618 # len(col0_cats) == 1 and
2619 col_idx > col0_hdrspan.start + col0_hdrspan.colspan
2620 ):
2621 # If an earlier header is only followed by headers that yield
2622 # no tags, expand it to entire row
2623 # print("EXPANDING COL0: {} from {} to {} cols {}"
2624 # .format(col0_hdrspan.text, col0_hdrspan.colspan,
2625 # len(row) - col0_hdrspan.start,
2626 # col0_hdrspan.tagsets))
2627 col0_hdrspan.colspan = len(row) - col0_hdrspan.start
2628 col0_hdrspan.expanded = True
2629 # XXX handle refs and defs
2630 # for x in hdrspans:
2631 # print(" HDRSPAN {} {} {} {!r}"
2632 # .format(x.start, x.colspan, x.tagsets, x.text))
2634 # Post-process German nouns with articles in separate columns. We move the
2635 # definite/indefinite/usually-without-article markers into the noun and
2636 # remove the article entries.
2637 if get_lang_conf(lang, "articles_in_separate_columns") and any(
2638 "noun" in x["tags"] for x in ret
2639 ):
2640 new_ret = []
2641 saved_tags = set()
2642 had_noun = False
2643 for dt in ret:
2644 tags = dt["tags"]
2645 # print(tags)
2646 if "noun" in tags:
2647 tags = list(
2648 sorted(set(t for t in tags if t != "noun") | saved_tags)
2649 )
2650 had_noun = True
2651 elif ( 2651 ↛ 2678line 2651 didn't jump to line 2678
2652 "indefinite" in tags
2653 or "definite" in tags
2654 or "usually-without-article" in tags
2655 or "without-article" in tags
2656 ):
2657 if had_noun:
2658 saved_tags = set(tags)
2659 else:
2660 saved_tags = saved_tags | set(tags) # E.g. Haus/German
2661 remove_useless_tags(lang, pos, saved_tags)
2662 saved_tags = saved_tags & set(
2663 [
2664 "masculine",
2665 "feminine",
2666 "neuter",
2667 "singular",
2668 "plural",
2669 "indefinite",
2670 "definite",
2671 "usually-without-article",
2672 "without-article",
2673 ]
2674 )
2675 had_noun = False
2676 continue # Skip the articles
2678 dt = dt.copy()
2679 dt["tags"] = tags
2680 new_ret.append(dt)
2681 ret = new_ret
2683 elif possibly_ignored_forms:
2684 # Some languages have tables with cells that are kind of separated
2685 # and difficult to handle, like eulersche Formel/German where
2686 # the definite and indefinite articles are just floating.
2687 # If a language has a dict of conditionally_ignored_cells,
2688 # and if the contents of a cell is found in one of the rules
2689 # there, ignore that cell if it
2690 # 1. Does not have the appropriate tag (like "definite" for "die")
2691 # and
2692 # 2. The title of the article is not one of the other co-words
2693 # (ie. it's an article for the definite articles in german etc.)
2694 # pass
2695 new_ret = []
2696 for cell_data in ret:
2697 tags = cell_data["tags"]
2698 text = cell_data["form"]
2699 skip_this = False
2700 for key_tag, ignored_forms in possibly_ignored_forms.items():
2701 if text not in ignored_forms: 2701 ↛ 2703line 2701 didn't jump to line 2703 because the condition on line 2701 was always true
2702 continue
2703 if word in ignored_forms:
2704 continue
2705 if key_tag not in tags:
2706 skip_this = True
2708 if skip_this: 2708 ↛ 2709line 2708 didn't jump to line 2709 because the condition on line 2708 was never true
2709 continue
2710 new_ret.append(cell_data)
2712 ret = new_ret
2714 # Post-process English inflection tables, addding "multiword-construction"
2715 # when the number of words has increased.
2716 if lang == "English" and pos == "verb":
2717 word_words = len(word.split())
2718 new_ret = []
2719 for dt in ret:
2720 form = dt.get("form", "")
2721 if len(form.split()) > word_words:
2722 dt = dt.copy()
2723 dt["tags"] = list(dt.get("tags", []))
2724 # This strange copy-assigning shuffle is preventative black
2725 # magic; do not touch lest you invoke deep bugs.
2726 data_append(dt, "tags", "multiword-construction")
2727 new_ret.append(dt)
2728 ret = new_ret
2730 # Always insert "table-tags" detail as the first entry in any inflection
2731 # table. This way we can reliably detect where a new table starts.
2732 # Table-tags applies until the next table-tags entry.
2733 if ret or table_tags:
2734 table_tags = list(sorted(set(table_tags)))
2735 dt = {
2736 "form": " ".join(table_tags),
2737 "source": source,
2738 "tags": ["table-tags"],
2739 }
2740 if dt["form"] == "":
2741 dt["form"] = "no-table-tags"
2742 if tablecontext.template_name:
2743 tn = {
2744 "form": tablecontext.template_name,
2745 "source": source,
2746 "tags": ["inflection-template"],
2747 }
2748 ret = [dt] + [tn] + ret
2749 else:
2750 ret = [dt] + ret
2752 return ret
def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Parse an inflection table given as ``rows`` of InflCell objects and
    append the extracted word forms to ``data["forms"]``.

    Duplicate forms are eliminated; in addition, a form carrying the
    "dated" tag is dropped when the identical form without "dated" has
    already been collected (e.g. Russian modern vs. pre-reform
    declension tables partially duplicating each other)."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        assert isinstance(row, list)
        for x in row:
            assert isinstance(x, InflCell)
    assert isinstance(titles, list)
    for x in titles:
        assert isinstance(x, str)

    # Try to parse the table as a simple table
    ret = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if ret is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms but eliminate duplicates.
    have_forms = set()
    for dt in ret:
        fdt = freeze(dt)
        if fdt in have_forms:
            continue  # Don't add duplicates
        # Some Russian words have Declension and Pre-reform declension partially
        # duplicating same data. Don't add "dated" tags variant if already have
        # the same without "dated" from the modern declension table

        tags = dt.get("tags", [])
        for dated_tag in ("dated",):
            if dated_tag in tags:
                dt2 = dt.copy()
                tags2 = list(x for x in tags if x != dated_tag)
                dt2["tags"] = tags2
                # Break (skipping the for-else below) when the same entry
                # without the "dated" tag was already collected.
                if tags2 and freeze(dt2) in have_forms:
                    break  # Already have without archaic
        else:
            # Reached only when no redundant "dated" duplicate was found.
            # "table-tags" pseudo-entries are appended but not remembered in
            # have_forms, so each new table can start with its own entry.
            if "table-tags" not in tags:
                have_forms.add(fdt)
            data_append(data, "forms", dt)
def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Decide whether a table cell should be treated as a header.

    Returns a tuple ``(is_title, hdr_expansion, target, celltext)`` where
    ``is_title`` tells whether the cell is a header, ``hdr_expansion`` is
    the tag-set expansion of the cleaned cell text, ``target`` is the
    remainder of a "Header: target" style cell (or the passed-in value),
    and ``celltext`` is the possibly truncated cell text."""
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # Determine what counts as a "header cell" node for this table type.
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # print("col: {}".format(col))
        # Heuristic header detection for data cells: only accepted when the
        # language and the specific cleaned text are explicitly listed.
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    if idx >= 0 and titletext[:idx] in infl_map:
        # "Header: target" style cell; split off the target part.
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext
class TableContext:
    """Saved context used when parsing a table and its subtables.

    Attributes:
        stored_hdrspans: header spans carried over between a table and
            its subtables.
        section_header: headers taken from the surrounding section.
        template_name: name of the inflection template that produced the
            table, or "" when unknown.
    """

    # BUG FIX: this was previously misspelled ``__slot__``, which is an
    # ordinary (ignored) class attribute; with the correct ``__slots__``
    # the declaration actually takes effect and instances no longer carry
    # a per-instance ``__dict__``.
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name=None):
        self.stored_hdrspans = []
        self.section_header = []
        # Normalize None/"" to "" so template_name is always a string.
        if not template_name:
            self.template_name = ""
        else:
            self.template_name = template_name
def handle_wikitext_or_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms. This adds the forms into ``data``."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    if not tablecontext:
        tablecontext = TableContext()

    def handle_table1(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        depth,
    ):
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order."""

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        col_gap_data = []  # Filling for columns with rowspan > 1
        # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
        # the column; vertical_still_left contains int
        cols_headered = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows = []

        sub_ret = []

        # Walk the direct children of the table node: captions, rows, and
        # (rarely) top-level cells.
        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are intially shown in tables
                    # that have more data. The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # Parse a table row.
                row = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in node.children:
                    # loop through each cell in the ROW
                    if not isinstance(col, WikiNode):
                        # This skip is not used for counting,
                        # "None" is not used in
                        # indexing or counting or looping.
                        continue
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print("    UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column. It is populated at the end of the loop, at the
                        # same time as col_gap_data. This needs to be looped and
                        # filled this way because each `for col`-looping jumps
                        # straight to the next meaningful cell; there is no
                        # "None" cells, only emptiness between, and rowspan and
                        # colspan are just to generate the "fill-
                        vertical_still_left[len(row)] -= 1
                        row.append(col_gap_data[len(row)])

                        # appending row is how "indexing" is
                        # done here; something is appended,
                        # like a filler-cell here or a "start"
                        # cell at the end of the row-loop,
                        # which increased len(row) which is
                        # then used as the target-index to check
                        # for gaps. vertical_still_left is
                        # the countdown to when to stop
                        # filling in gaps, and goes down to 0,
                        # and col_gap_data is not touched
                        # except when a new rowspan is needed,
                        # at the same time that
                        # vertical_still_left gets reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        # Malformed span attributes: fall back to 1x1 cell.
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: isinstance(x, WikiNode)
                        and (x.kind == NodeKind.TABLE or x.sarg == "table"),
                    )

                    # Clean the rest of the cell.
                    celltext = clean_node(wxr, None, rest)
                    # print("CLEANED:", celltext)

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a descrip-
                        # tive text that should be treated as a title (e.g.,
                        # "Forms with the definite article", with "definite" not
                        # mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Flush the rows accumulated so far as one piece,
                            # then splice in the subtable's pieces to keep
                            # everything in document order.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    # Strip trailing superscript reference marks from titles.
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #    row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/ed if
                                # it is lacking slots
                                # for each column; col_gap_data and
                                # vertical_still_left are never
                                # reset to [], during
                                # the whole table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that
                            # will be used in the
                            # future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to be
                            # filled (row.append or
                            # row[col_gap_data[len(row)] =>
                            # rows), it is not reset to [], but decremented to 0
                            # each time a row gets something from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        # Return all accumulated (rows, titles, after, depth) pieces in
        # document order.
        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects. Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )
def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Thin pass-through for HTML tables; delegates all work to
    handle_wikitext_or_html_table().  XXX consider removing this wrapper."""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )
def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """A passer-on function for wikitext tables, XXX, remove these?"""
    # Fixed docstring: it was copy-pasted from handle_html_table and
    # incorrectly said "html-tables"; this wrapper handles wikitext tables.
    handle_wikitext_or_html_table(
        wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
    )
def parse_inflection_section(
    wxr, data, word, lang, pos, section, tree, tablecontext=None
):
    """Parses an inflection section on a page. ``data`` should be the
    data for a part-of-speech, and inflections will be added to it."""

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    #       .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Collected tables: each entry is [kind, node, titles, after-text parts].
    tables = []
    # Text fragments accumulated inside a NavFrame header; they become the
    # title for the NavContent tables that follow.
    titleparts = []

    def process_tables():
        # Dispatch each collected table to the appropriate handler.
        for kind, node, titles, after in tables:
            after = "".join(after).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node, titles):
        # Process a NavFrame div in isolation: tables found inside it are
        # handled immediately with the NavHead text as an extra title,
        # then the previously collected tables are restored.
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(node, titles, navframe=False):
        # Walk the parse tree collecting tables (and, when navframe=True,
        # title text fragments) into the enclosing function's state.
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Strings after a table are collected as its "after" text.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    # Note-like NavHead texts are not used as table titles.
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                classes = node.attrs.get("class", ())
                if "audiotable" in classes:
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
            if (
                kind == NodeKind.HTML
                and node.sarg == "div"
                and "NavFrame" in node.attrs.get("class", "").split()
            ):
                recurse_navframe(node, titles)
                return
        if kind == NodeKind.LINK:
            # For links, descend into the display text when present,
            # otherwise into the link target.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")