Coverage for src/wiktextract/extractor/en/inflection.py: 87%
1513 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1# Code for parsing inflection tables.
2#
3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org.
5import collections
6import copy
7import functools
8import html
9import itertools
10import re
11import unicodedata
12from typing import Generator, Optional, Union
14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode
16from ...clean import clean_value
17from ...datautils import data_append, freeze, split_at_comma_semi
18from ...tags import valid_tags
19from ...wxr_context import WiktextractContext
20from .form_descriptions import (
21 classify_desc,
22 decode_tags,
23 distw,
24 parse_head_final_tags,
25)
26from .inflection_kludges import ka_decl_noun_template_cell
27from .inflectiondata import infl_map, infl_start_map, infl_start_re
28from .lang_specific_configs import get_lang_conf, lang_specific_tags
29from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS
30from .type_utils import FormData
# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
debug_cell_text: Optional[str] = None


def set_debug_cell_text(text: str) -> None:
    """Set the cell text that triggers debug output during table parsing.

    Stored in the module-level ``debug_cell_text`` so that all parsing
    code in this module can check it.
    """
    global debug_cell_text
    debug_cell_text = text
# Type alias: a list of alternative tag tuples; each tuple is one possible
# interpretation of a header cell as a set of tags.
TagSets = list[tuple[str, ...]]

# Column texts that are interpreted as an empty column.
# Mostly assorted Unicode hyphen/dash characters, plus a few literal phrases.
IGNORED_COLVALUES = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "-",
    "/",
    "?",
    "not used",
    "not applicable",
}
# These tags are never inherited from above
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags.  Every entry
# maps a base tag to the same tag with an "object-" prefix; insertion
# order matches the original hand-written table (persons/numbers/
# definiteness, noun classes 1-18, then genders).
object_concord_replacements = {
    tag: f"object-{tag}"
    for tag in (
        "first-person",
        "second-person",
        "third-person",
        "singular",
        "plural",
        "definite",
        "indefinite",
        *(f"class-{i}" for i in range(1, 19)),
        "masculine",
        "feminine",
    )
}
# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: every space-separated tag in the mapped values
# must be a recognized tag.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# Pattern for the generic "X of WORD" table-header part, which must not be
# mistaken for one of the keyword matches above.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)
# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
}
# Import-time sanity check: every space-separated tag in the mapped values
# must be a recognized tag.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )

title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)
# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "Attic": "Attic",  # e.g. καλός/Greek/Adj
    "Epic": "Epic",  # e.g. καλός/Greek/Adj
}
# Import-time sanity check: every space-separated tag in the mapped values
# must be a recognized tag.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# Parenthesized element starts to map them to tags for form for the rest of
# the element
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check for the mapped tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)
# Regexp for cell starts that are likely definitions of reference symbols.
# Matches e.g. "* ...", "1) ...", "† ...", and superscript markers.
# See also nondef_re.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"
# Certain tags are moved from headers in tables into word tags, as they always
# apply to the whole word.
# This set was originally created for an issue with number paradigms in
# Arabic, but that is being handled elsewhere now; it is kept (empty) so
# the lookup sites remain in place.
TAGS_FORCED_WORDTAGS: set[str] = set()
class InflCell:
    """Cell in an inflection table.

    Holds the stripped cell text, whether the cell acts as a header
    ("title") cell, its col/row spans, and an optional link target.
    """

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        self.text = text.strip()
        # A cell with empty text is never a title.  Coerce with bool() so
        # is_title is always True/False: the original expression
        # ``text and is_title`` stored the empty string itself when text
        # was "", violating the True/False contract asserted above.
        self.is_title = bool(text) and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        v = "{}/{}/{}/{!r}".format(
            self.text, self.is_title, self.colspan, self.rowspan
        )
        if self.target:
            v += ": {!r}".format(self.target)
        return v

    def __repr__(self) -> str:
        return str(self)
class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: list[tuple[str, ...]],  # TagSets alias, expanded inline
        text: str,
        all_headers_row: bool,
    ) -> None:
        # Validate arguments before storing anything.
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        for alt in tagsets:
            assert isinstance(alt, tuple)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Normalize each alternative: deduplicate its tags and sort them.
        self.tagsets = [tuple(sorted(set(alt))) for alt in tagsets]
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False
def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        char_name = unicodedata.name(ch)
    except ValueError:
        # Characters with no Unicode name are never superscripts.
        return False
    # Superscripts and modifier letters have names beginning with one of
    # these prefixes (equivalent to the original anchored regex match).
    return char_name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )
def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no purpose
    together (cover all options)."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Complementary pairs that cancel out when both are present, for
    # languages whose configuration asks for their removal.  The config
    # key is only consulted when both members of the pair are present,
    # matching the original short-circuit order.
    for pair, conf_key in (
        (("animate", "inanimate"), "animate_inanimate_remove"),
        (("virile", "nonvirile"), "virile_nonvirile_remove"),
    ):
        if all(t in tags for t in pair) and get_lang_conf(lang, conf_key):
            for t in pair:
                tags.remove(t)
    # If every value of a whole category for the language is listed
    # (all numbers, all genders, ...), the tags carry no information and
    # are all removed.  Categories are processed in the original order.
    for conf_key in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        values = get_lang_conf(lang, conf_key)
        if values and all(v in tags for v in values):
            for v in values:
                tags.remove(v)
def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    # valid_tags maps each tag to its category name.
    return {valid_tags[tag] for alt in tagset for tag in alt}
def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # Fixed: the original re-checked tagsets1 here instead of tagsets2
    # (copy-paste error), leaving tagsets2 element types unvalidated.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Add one alternative to ``tagsets``, merging it with an existing
        # alternative when they differ in at most one tag category.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        tagsets.append(())

    # print("or_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, tagsets))
    return tagsets
def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # Fixed: the original re-checked tagsets1 here instead of tagsets2
    # (copy-paste error), leaving tagsets2 element types unvalidated.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            # Union of one alternative from each side, then drop tags that
            # carry no information for this language/POS.
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    # print("and_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, new_tagsets))
    return new_tagsets
@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags).

    ``refs`` is a list of NFKD-normalized reference-marker symbols found at
    the end of the cell; ``defs`` is a list of (symbol, definition-text)
    pairs when the cell itself defines reference symbols; ``tags`` is extra
    header tags implied by markers (e.g. "rare").

    NOTE(review): results are memoized by lru_cache and the returned lists
    are mutable; callers are presumably expected not to mutate them —
    verify at call sites.
    """
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags: list[str] = []
    # Strip trailing comma/bullet and collapse all whitespace runs.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Cells that begin with explanatory prose are ignored entirely.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers
    refs: list[str] = []
    special_references = get_lang_conf(lang, "special_references")
    # Peel off trailing "^x" / "^(x,y)" markers one at a time.
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # The cell defines reference symbols: collect (symbol, text) pairs.
        ofs = 0
        ref = None
        deflst: list[tuple[str, str]] = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    #       .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags
@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is dictionary describing additional forms to be
    included in the part-of-speech entry).

    NOTE(review): results are memoized by lru_cache and the returned lists
    are mutable; callers are presumably expected not to mutate them —
    verify at call sites.
    """
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Normalize: decode HTML entities, strip tags, collapse whitespace.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags: list[str] = []
    table_tags: list[str] = []
    extra_forms: list[FormData] = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip matches that are just the generic "X of WORD" header part.
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    # French reflexive verbs ("s'..." / "se ...") get a global tag.
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                # Element may start with a recognized prefix (e.g. "class"),
                # in which case the rest becomes a form of that kind.
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms
770def expand_header(
771 wxr: WiktextractContext,
772 tablecontext: "TableContext",
773 word: str,
774 lang: str,
775 pos: str,
776 text: str,
777 base_tags: Union[list[str], set[str], tuple[str, ...]],
778 silent=False,
779 ignore_tags=False,
780 depth=0,
781) -> list[tuple[str, ...]]:
782 """Expands a cell header to tagset, handling conditional expressions
783 in infl_map. This returns list of tuples of tags, each list element
784 describing an alternative interpretation. ``base_tags`` is combined
785 column and row tags for the cell in which the text is being interpreted
786 (conditional expressions in inflection data may depend on it).
787 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags``
788 is True, then tags listed in "if" will be ignored in the test (this is
789 used when trying to heuristically detect whether a non-<th> cell is anyway
790 a header)."""
791 assert isinstance(wxr, WiktextractContext)
792 assert isinstance(word, str)
793 assert isinstance(lang, str)
794 assert isinstance(pos, str)
795 assert isinstance(text, str)
796 assert isinstance(base_tags, (list, tuple, set))
797 assert silent in (True, False)
798 assert isinstance(depth, int)
799 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags))
800 # First map the text using the inflection map
801 text = clean_value(wxr, text)
802 combined_return: list[tuple[str, ...]] = []
803 parts = split_at_comma_semi(text, separators=[";"])
804 for text in parts:
805 if not text: 805 ↛ 806line 805 didn't jump to line 806 because the condition on line 805 was never true
806 continue
807 if text in infl_map:
808 v = infl_map[text] # list or string
809 else:
810 m = re.match(infl_start_re, text)
811 if m is not None: 811 ↛ 812line 811 didn't jump to line 812 because the condition on line 811 was never true
812 v = infl_start_map[m.group(1)]
813 # print("INFL_START {} -> {}".format(text, v))
814 elif re.match(r"Notes", text):
815 # Ignored header
816 # print("IGNORING NOTES")
817 combined_return = or_tagsets(
818 lang, pos, combined_return, [("dummy-skip-this",)]
819 )
820 # this just adds dummy-skip-this
821 continue
822 elif text in IGNORED_COLVALUES:
823 combined_return = or_tagsets(
824 lang, pos, combined_return, [("dummy-ignore-skipped",)]
825 )
826 continue
827 # Try without final parenthesized part
828 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text)
829 if text_without_parens in infl_map:
830 v = infl_map[text_without_parens]
831 elif m is None: 831 ↛ 847line 831 didn't jump to line 847 because the condition on line 831 was always true
832 if not silent:
833 wxr.wtp.debug(
834 "inflection table: unrecognized header: {}".format(
835 repr(text)
836 ),
837 sortid="inflection/735",
838 )
839 # Unrecognized header
840 combined_return = or_tagsets(
841 lang, pos, combined_return, [("error-unrecognized-form",)]
842 )
843 continue
845 # Then loop interpreting the value, until the value is a simple string.
846 # This may evaluate nested conditional expressions.
847 default_then = None
848 while True:
849 # If it is a string, we are done.
850 if isinstance(v, str):
851 tags = set(v.split())
852 remove_useless_tags(lang, pos, tags)
853 tagset = [tuple(sorted(tags))]
854 break
855 # For a list, just interpret it as alternatives. (Currently the
856 # alternatives must directly be strings.)
857 if isinstance(v, (list, tuple)):
858 tagset = []
859 for x in v:
860 tags = set(x.split())
861 remove_useless_tags(lang, pos, tags)
862 tags_t = tuple(sorted(tags))
863 if tags_t not in tagset: 863 ↛ 859line 863 didn't jump to line 859 because the condition on line 863 was always true
864 tagset.append(tags_t)
865 break
866 # Otherwise the value should be a dictionary describing a
867 # conditional expression.
868 if not isinstance(v, dict): 868 ↛ 869line 868 didn't jump to line 869 because the condition on line 868 was never true
869 wxr.wtp.debug(
870 "inflection table: internal: "
871 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]),
872 sortid="inflection/767",
873 )
874 tagset = [()]
875 break
876 # Evaluate the conditional expression.
877 assert isinstance(v, dict)
878 cond: Union[bool, str] = "default-true"
879 c: Union[str, list[str], set[str]] = ""
880 # Handle "lang" condition. The value must be either a
881 # single language or a list of languages, and the
882 # condition evaluates to True if the table is one of
883 # those languages.
884 if "lang" in v:
885 c = v["lang"]
886 if isinstance(c, str):
887 cond = c == lang
888 else:
889 assert isinstance(c, (list, tuple, set))
890 cond = lang in c
891 # Handle "nested-table-depth" condition. The value must
892 # be an int or list of ints, and the condition evaluates
893 # True if the depth is one of those values.
894 # "depth" is how deep into a nested table tree the current
895 # table lies. It is first started in handle_wikitext_table,
896 # so only applies to tables-within-tables, not other
897 # WikiNode content. `depth` is currently only passed as a
898 # parameter down the table parsing stack, and not stored.
899 if cond and "nested-table-depth" in v: 899 ↛ 900line 899 didn't jump to line 900 because the condition on line 899 was never true
900 d = v["nested-table-depth"]
901 if isinstance(d, int):
902 cond = d == depth
903 else:
904 assert isinstance(d, (list, tuple, set))
905 cond = depth in d
906 # Handle inflection-template condition. Must be a string
907 # or list of strings, and if tablecontext.template_name is in
908 # those, accept the condition.
909 # TableContext.template_name is passed down from page/
910 # parse_inflection, before parsing and expanding itself
911 # has begun.
912 if cond and tablecontext and "inflection-template" in v:
913 d1 = v["inflection-template"]
914 if isinstance(d1, str): 914 ↛ 917line 914 didn't jump to line 917 because the condition on line 914 was always true
915 cond = d1 == tablecontext.template_name
916 else:
917 assert isinstance(d1, (list, tuple, set))
918 cond = tablecontext.template_name in d1
919 # Handle "pos" condition. The value must be either a single
920 # part-of-speech or a list of them, and the condition evaluates to
921 # True if the part-of-speech is any of those listed.
922 if cond and "pos" in v:
923 c = v["pos"]
924 if isinstance(c, str):
925 cond = c == pos
926 else:
927 assert isinstance(c, (list, tuple, set))
928 cond = pos in c
929 # Handle "if" condition. The value must be a string containing a
930 # space-separated list of tags. The condition evaluates to True if
931 # ``base_tags`` contains all of the listed tags. If the condition
932 # is of the form "any: ...tags...", then any of the tags will be
933 # enough.
934 if cond and "if" in v and not ignore_tags:
935 c = v["if"]
936 assert isinstance(c, str)
937 # "if" condition is true if any of the listed tags is present if
938 # it starts with "any:", otherwise all must be present
939 if c.startswith("any: "):
940 cond = any(t in base_tags for t in c[5:].split())
941 else:
942 cond = all(t in base_tags for t in c.split())
944 # Handle "default" assignment. Store the value to be used
945 # as a default later.
946 if "default" in v:
947 assert isinstance(v["default"], str)
948 default_then = v["default"]
950 # Warning message about missing conditions for debugging.
952 if cond == "default-true" and not default_then and not silent:
953 wxr.wtp.debug(
954 "inflection table: IF MISSING COND: word={} "
955 "lang={} text={} base_tags={} c={} cond={}".format(
956 word, lang, text, base_tags, c, cond
957 ),
958 sortid="inflection/851",
959 )
960 # Based on the result of evaluating the condition, select either
961 # "then" part or "else" part.
962 if cond:
963 v = v.get("then", "")
964 else:
965 v1 = v.get("else")
966 if v1 is None:
967 if default_then:
968 v = default_then
969 else:
970 if not silent:
971 wxr.wtp.debug(
972 "inflection table: IF WITHOUT ELSE EVALS "
973 "False: "
974 "{}/{} {!r} base_tags={}".format(
975 word, lang, text, base_tags
976 ),
977 sortid="inflection/865",
978 )
979 v = "error-unrecognized-form"
980 else:
981 v = v1
983 # Merge the resulting tagset from this header part with the other
984 # tagsets from the whole header
985 combined_return = or_tagsets(lang, pos, combined_return, tagset)
987 # Return the combined tagsets, or empty tagset if we got no tagsets
988 if not combined_return:
989 combined_return = [()]
990 return combined_return
def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list["HdrSpan"],
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Walks ``hdrspans`` in reverse (headers closest above the cell first),
    keeping only spans that horizontally overlap the column range
    ``[start, start + colspan)``, and merges their tagsets row by row
    (``and_tagsets`` across rows, ``or_tagsets`` within a row).  A large
    number of category-pair rules (mood/mood, tense/tense, …) decide when
    to stop taking headers from further up the table; several are gated by
    per-language configuration via ``get_lang_conf``.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    #       .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                "  row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    used = set()  # (start, colspan) keys already consumed
    coltags = [()]
    last_header_row = 1000000
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000
    used_hdrspans = set()  # id()s of HdrSpan objects already merged
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets. This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row. Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside
                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above. However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 10000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags
1375def parse_simple_table(
1376 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
1377):
1378 """This is the default table parser. Despite its name, it can parse
1379 complex tables. This returns a list of forms to be added to the
1380 part-of-speech, or None if the table could not be parsed."""
1381 assert isinstance(wxr, WiktextractContext)
1382 assert isinstance(tablecontext, TableContext)
1383 assert isinstance(word, str)
1384 assert isinstance(lang, str)
1385 assert isinstance(pos, str)
1386 assert isinstance(rows, list)
1387 assert isinstance(source, str)
1388 assert isinstance(after, str)
1389 assert isinstance(depth, int)
1390 for row in rows:
1391 for col in row:
1392 assert isinstance(col, InflCell)
1393 assert isinstance(titles, list)
1394 for x in titles:
1395 assert isinstance(x, str)
1397 # print("PARSE_SIMPLE_TABLE: TITLES:", titles)
1398 if debug_cell_text: 1398 ↛ 1399line 1398 didn't jump to line 1399 because the condition on line 1398 was never true
1399 print("ROWS:")
1400 for row in rows:
1401 print(" ", row)
1403 # Check for forced rowspan kludge. See e.g.
1404 # maorski/Serbo-Croatian. These are essentially multi-row
1405 # cells implemented using <br> rather than separate cell. We fix this
1406 # by identifying rows where this happens, and splitting the current row
1407 # to multiple rows by synthesizing additional cells.
1408 new_rows = []
1409 for row in rows:
1410 split_row = (
1411 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row)
1412 and
1413 # x is an InflCell
1414 all(x.rowspan == 1 for x in row)
1415 )
1416 if not split_row:
1417 new_rows.append(row)
1418 continue
1419 row1 = []
1420 row2 = []
1421 for cell in row:
1422 cell1 = copy.deepcopy(cell)
1423 if "\n" in cell.text:
1424 # Has more than one line - split this cell
1425 parts = cell.text.strip().splitlines()
1426 if len(parts) != 2: 1426 ↛ 1427line 1426 didn't jump to line 1427 because the condition on line 1426 was never true
1427 wxr.wtp.debug(
1428 "forced rowspan kludge got {} parts: {!r}".format(
1429 len(parts), cell.text
1430 ),
1431 sortid="inflection/1234",
1432 )
1433 cell2 = copy.deepcopy(cell)
1434 cell1.text = parts[0]
1435 cell2.text = parts[1]
1436 else:
1437 cell1.rowspan = 2
1438 cell2 = cell1 # ref, not a copy
1439 row1.append(cell1)
1440 row2.append(cell2)
1441 new_rows.append(row1)
1442 new_rows.append(row2)
1443 rows = new_rows
1444 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:")
1445 # for row in rows:
1446 # print(" ", row)
1448 # Parse definitions for references (from table itself and from text
1449 # after it)
1450 def_ht = {}
1452 def add_defs(defs: list[tuple[str, str]]) -> None:
1453 for ref, d in defs:
1454 # print("DEF: ref={} d={}".format(ref, d))
1455 d = d.strip()
1456 d = d.split(". ")[0].strip() # text before ". "
1457 if not d: 1457 ↛ 1458line 1457 didn't jump to line 1458 because the condition on line 1457 was never true
1458 continue
1459 if d.endswith("."): # catc ".."??
1460 d = d[:-1]
1461 tags, topics = decode_tags(d, no_unknown_starts=True)
1462 # print(f"{ref=}, {d=}, {tags=}")
1463 if topics or any("error-unknown-tag" in ts for ts in tags):
1464 d = d[0].lower() + d[1:]
1465 tags, topics = decode_tags(d, no_unknown_starts=True)
1466 if topics or any("error-unknown-tag" in ts for ts in tags):
1467 # Failed to parse as tags
1468 # print("Failed: topics={} tags={}"
1469 # .format(topics, tags))
1470 continue
1471 tags1_s: set[str] = set()
1472 for ts in tags:
1473 tags1_s.update(ts)
1474 tags1 = tuple(sorted(tags1_s))
1475 # print("DEFINED: {} -> {}".format(ref, tags1))
1476 def_ht[ref] = tags1
    def generate_tags(
        rowtags: list[tuple[str, ...]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        """Expand a header cell's text into tagsets combined with the
        current row and column context.

        Returns (new_rowtags, new_coltags, all_hdr_tags).  Reads the
        enclosing scope's ``hdrspans``, ``col_idx``, ``colspan``, ``col``,
        ``text``, ``global_tags``, ``refs_tags``, ``hdr_tags`` and
        ``depth``; may also append to its ``table_tags`` argument
        (forced word-level tags), which the caller observes.
        """
        new_coltags = []
        all_hdr_tags = []  # list of tuples
        new_rowtags = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                base_tags: set[str] = (
                    set(rt0)
                    | set(ct0)
                    | set(global_tags)
                    | set(itertools.chain.from_iterable(table_tags))
                )  # Union.
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Certain tags are always moved to word-level tags
                    if tt_s & TAGS_FORCED_WORDTAGS:
                        table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS)
                        tt_s = tt_s - TAGS_FORCED_WORDTAGS
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags
    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        """Create a HdrSpan for the current header cell, append it to
        ``hdrspans``, and update the left-edge (col0) header-expansion
        state.

        Reads ``col_idx``, ``colspan``, ``rowspan``, ``rownum``,
        ``new_coltags``, ``all_headers``, ``previously_seen`` and
        ``all_hdr_tags`` from the enclosing scope.  Returns the possibly
        unchanged ``(col, col0_followed_by_nonempty, col0_hdrspan)``
        triple for the caller to carry forward.
        """
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags). In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            # No current left-edge header; this one becomes it.
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # print("col0_cats={} later_cats={} "
            #       "fol_by_nonempty={} col_idx={} end={} "
            #       "tagsets={}"
            #       .format(col0_cats, later_cats,
            #               col0_followed_by_nonempty, col_idx,
            #               col0_hdrspan.start +
            #               col0_hdrspan.colspan,
            #               col0_hdrspan.tagsets))
            # print("col0.rowspan={} rowspan={}"
            #       .format(col0_hdrspan.rowspan, rowspan))
            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            #   - col0_hdrspan set, continue with allowed current
            #   - col0_hdrspan set, expand, start new
            #   - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan. Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan
                # *Expand* current col0_hdrspan
                # print("EXPANDING COL0 MID: {} from {} to {} "
                #       "cols {}"
                #       .format(col0_hdrspan.text,
                #               col0_hdrspan.colspan,
                #               col_idx - col0_hdrspan.start,
                #               col0_hdrspan.tagsets))
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan
1637 def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
1638 # Split the cell text into alternatives
1639 split_extra_tags = []
1640 if col and is_superscript(col[0]): 1640 ↛ 1641line 1640 didn't jump to line 1641 because the condition on line 1640 was never true
1641 alts = [col]
1642 else:
1643 separators = [";", "•", r"\n", " or "]
1644 if " + " not in col:
1645 separators.append(",")
1646 if not col.endswith("/"):
1647 separators.append("/")
1648 if col in special_phrase_splits:
1649 # Use language-specific special splits.
1650 # These are phrases and constructions that have
1651 # unique ways of splitting, not specific characters
1652 # to split on like with the default splitting.
1653 alts, tags = special_phrase_splits[col]
1654 split_extra_tags = tags.split()
1655 for x in split_extra_tags:
1656 assert x in valid_tags
1657 assert isinstance(alts, (list, tuple))
1658 assert isinstance(tags, str)
1659 else:
1660 # Use default splitting. However, recognize
1661 # language-specific replacements and change them to magic
1662 # characters before splitting. This way we won't split
1663 # them. This is important for, e.g., recognizing
1664 # alternative pronouns.
1665 # The magic characters are characters out of Unicode scope
1666 # that are given a simple incremental value, int > unicode.
1667 repls = {}
1668 magic_ch = MAGIC_FIRST
1669 trs = get_lang_conf(lang, "form_transformations")
1670 # trs is a list of lists of strings
1671 for _, v, _, _ in trs:
1672 # v is a pattern string, like "^ich"
1673 # form_transformations data is doing double-duty here,
1674 # because the pattern strings are already known to us and
1675 # not meant to be split.
1676 m = re.search(v, col)
1677 if m is not None:
1678 # if pattern found in text
1679 magic = chr(magic_ch)
1680 magic_ch += 1 # next magic character value
1681 col = re.sub(v, magic, col) # replace with magic ch
1682 repls[magic] = m.group(0)
1683 # remember what regex match string each magic char
1684 # replaces. .group(0) is the whole match.
1685 alts0 = split_at_comma_semi(col, separators=separators)
1686 # with magic characters in place, split the text so that
1687 # pre-transformation text is out of the way.
1688 alts = []
1689 for alt in alts0:
1690 # create a new list with the separated items and
1691 # the magic characters replaced with the original texts.
1692 for k, v in repls.items():
1693 alt = re.sub(k, v, alt)
1694 alts.append(alt)
1696 # Remove "*" from beginning of forms, as in non-attested
1697 # or reconstructed forms. Otherwise it might confuse romanization
1698 # detection.
1699 alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts)
1700 alts = list(
1701 x for x in alts if not re.match(r"pronounced with |\(with ", x)
1702 )
1703 alts = list(
1704 re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts
1705 )
1706 return col, alts, split_extra_tags
def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
    """Convert raw cell alternatives into (form, romanization, ipa) triples.

    Handle the special case where romanization is given under the
    normal form, e.g. in Russian. There can be multiple
    comma-separated forms in each case. We also handle the case
    where instead of romanization we have IPA pronunciation
    (e.g., avoir/French/verb).

    The branches below are heuristics tried in priority order; each
    produces ``nalts``, a list of (form, romanization, ipa) tuples
    where unused slots are "".
    """
    len2 = len(alts) // 2
    # Check for IPAs (forms first, IPAs under)
    # base, base, IPA, IPA
    if (
        len(alts) % 2 == 0  # Divisible by two
        and all(
            re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
            for x in alts[len2:]
        )
    ):  # In the second half of alts
        nalts = list(
            (alts[i], "", alts[i + len2])
            # List of tuples: (base, "", ipa)
            for i in range(len2)
        )
    # base, base, base, IPA
    elif (
        len(alts) > 2
        and re.match(r"^\s*/.*/\s*$", alts[-1])
        and all(not x.startswith("/") for x in alts[:-1])
    ):
        # Only if the last alt is IPA; the single IPA is paired with
        # every preceding base form.
        nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
    # base, IPA, IPA, IPA
    elif (
        len(alts) > 2
        and not alts[0].startswith("/")
        and all(
            re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
        )
    ):
        # First is base and the rest is IPA alternatives
        nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))
    # Check for romanizations, forms first, romanizations under
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    # Remove ends of strings starting from ^.
                    # Superscripts have been already removed
                    # from the string, while ^xyz needs to be
                    # removed separately, though it's usually
                    # something with a single letter?
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            == "other"
            for x in alts[:len2]
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for x in alts[len2:]
        )
    ):
        nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
    # Check for romanizations, forms and romanizations alternating
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            == "other"
            # odds (0-based even indices) must be script text
            for i in range(0, len(alts), 2)
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            # evens (0-based odd indices) must be romanization/English
            for i in range(1, len(alts), 2)
        )
    ):
        nalts = list(
            (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
        )
    # Handle complex Georgian entries with alternative forms and
    # romanizations. It's a bit of a mess. Remove this kludge if not
    # needed anymore. NOTE THAT THE PARENTHESES ON THE WEBSITE ARE NOT
    # DISPLAYED. They are put inside their own span elements that are
    # then hidden with some CSS.
    # https://en.wiktionary.org/wiki/%E1%83%90%E1%83%9B%E1%83%94%E1%83%A0%E1%83%98%E1%83%99%E1%83%98%E1%83%A1_%E1%83%A8%E1%83%94%E1%83%94%E1%83%A0%E1%83%97%E1%83%94%E1%83%91%E1%83%A3%E1%83%9A%E1%83%98_%E1%83%A8%E1%83%A2%E1%83%90%E1%83%A2%E1%83%94%E1%83%91%E1%83%98
    # ამერიკის შეერთებულ შტატებს(ა) (ameriḳis šeertebul šṭaṭebs(a))
    # The above should generate two alts entries, with two different
    # parallel versions, one without (a) and with (a) at the end,
    # for both the Georgian original and the romanization.
    elif (
        tablecontext.template_name == "ka-decl-noun"
        and len(alts) == 1
        and " (" in alts[0]
    ):
        nalts = ka_decl_noun_template_cell(alts)
    else:
        # Fallback: expand optional parenthesized infixes/suffixes such
        # as "kind(er)" or "lampai(tten/den)" into all concrete variants.
        new_alts = []
        for alt in alts:
            lst = [""]
            idx = 0
            for m in re.finditer(
                r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                # start OR letter OR asterisk (word/word*)
                # \\___________group 1_______/ \ \_g3_///
                #               \ \__gr. 2_//
                # \_____________group 0________________/
                alt,
            ):
                v = m.group(2)  # (word/word/word...)
                if (
                    classify_desc(v) == "tags"  # Tags inside parens
                    or m.group(0) == alt
                ):  # All in parens
                    continue
                new_lst = []
                for x in lst:
                    x += alt[idx : m.start()] + m.group(1)
                    # alt until letter or asterisk
                    idx = m.end()
                    vparts = v.split("/")
                    # group(2) = ["word", "wörd"...]
                    if len(vparts) == 1:
                        new_lst.append(x)
                        new_lst.append(x + v)
                        # "kind(er)" -> ["kind", "kinder"]
                    else:
                        for vv in vparts:
                            new_lst.append(x + vv)
                        # "lampai(tten/den)" ->
                        # ["lampaitten", "lampaiden"]
                lst = new_lst
            for x in lst:
                new_alts.append(x + alt[idx:])
                # add the end of alt
        nalts = list((x, "", "") for x in new_alts)
        # [form, no romz, no ipa]
    return nalts
def find_semantic_parens(form: str) -> tuple[str, list[str]]:
    """Strip semantic bracketing from *form* and return implied tags.

    "Some languages" (=Greek) use brackets to mark things that
    require tags, like (informality), [rarity] and {archaicity}.
    The brackets are always removed; the associated tags are added
    only when the language configuration enables that convention.
    """
    # (regex, chars to strip from each end, lang-config keys that must
    #  all be truthy, tags implied when they are)
    bracket_rules = (
        (r"\([^][(){}]*\)$", 1, ("parentheses_for_informal",), ["informal"]),
        (
            r"\{\[[^][(){}]*\]\}$",
            2,
            ("square_brackets_for_rare", "curly_brackets_for_archaic"),
            # είμαι/Greek/Verb
            ["rare", "archaic"],
        ),
        (r"\{[^][(){}]*\}$", 1, ("curly_brackets_for_archaic",), ["archaic"]),
        (r"\[[^][(){}]*\]$", 1, ("square_brackets_for_rare",), ["rare"]),
    )
    implied_tags: list[str] = []
    for pattern, strip, conf_keys, new_tags in bracket_rules:
        if re.match(pattern, form):
            form = form[strip:-strip]
            if all(get_lang_conf(lang, key) for key in conf_keys):
                implied_tags.extend(new_tags)
            break  # original is an if/elif chain: first match only
    return form, implied_tags
def handle_parens(
    form: str, roman: str, clitic: str, extra_tags: list[str]
) -> tuple[str, str, str]:
    """Interpret the parenthesized text found in *form*.

    Relies on `paren` (the paren contents), `m` (the regex match over
    *form*), and `subst` (the separator to splice back in) from the
    enclosing scope. Depending on what the parenthesized text looks
    like, it becomes a clitic, tags (appended to *extra_tags*), or a
    romanization; in each of those cases the paren span is removed
    from *form*.
    """
    # *form* with the matched "(...)" span replaced by the separator.
    spliced = (form[: m.start()] + subst + form[m.end() :]).strip()
    if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
        # A short apostrophe-initial paren is taken to be a clitic.
        clitic = paren
        form = spliced
    elif classify_desc(paren) == "tags":
        decoded_tagsets, decoded_topics = decode_tags(paren)
        if not decoded_topics:
            for tagset in decoded_tagsets:
                # There are some generated tags containing spaces;
                # do not let them through here.
                extra_tags.extend(t for t in tagset if " " not in t)
            form = spliced
    elif (
        # Paren looks like a romanization of the preceding text
        m.start() > 0
        and not roman
        and classify_desc(form[: m.start()]) == "other"  # "other" ~ text
        and classify_desc(paren) in ("romanization", "english")
        and not re.search(r"^with |-form$", paren)
    ):
        roman = paren
        form = spliced
    elif re.search(r"^with |-form", paren):
        # Drop the paren but keep the rest of the form unchanged.
        form = spliced
    return form, roman, clitic
def merge_row_and_column_tags(form: str, some_has_covered_text: bool):
    """Merge row and column header tags into form entries.

    Merge column tags and row tags. We give preference
    to moods etc coming from rowtags (cf. austeigen/German/Verb
    imperative forms).

    Returns ``(ret, form, some_has_covered_text)`` where ``ret`` is a
    list of form dicts (one per surviving rowtag/coltag combination,
    plus separate "clitic" entries), ``form`` is the possibly-adjusted
    form text, and ``some_has_covered_text`` is the updated flag.

    Uses many names from the enclosing scope: rowtags, coltags,
    global_tags, extra_tags, refs_tags, tablecontext, col_idx,
    has_covering_hdr, roman, ipa, clitic, source, lang, pos, wxr,
    object_concord_replacements.
    """
    # In certain cases, what a tag means depends on whether
    # it is a row or column header. Depending on the language,
    # we replace certain tags with others if they're in
    # a column or row

    ret = []
    # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
    # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
    for rt in sorted(rowtags):
        if "dummy-use-as-coltags" in rt:
            continue
        # if lang was in rowtag_replacements)
        # if not rtagreplacs == None:
        #    rt = replace_directional_tags(rt, rtagreplacs)
        for ct in sorted(coltags):
            if "dummy-use-as-rowtags" in ct:
                continue
            # if lang was in coltag_replacements
            # if not ctagreplacs == None:
            #    ct = replace_directional_tags(ct,
            #                                  ctagreplacs)
            tags = set(global_tags)
            tags.update(extra_tags)
            tags.update(rt)
            tags.update(refs_tags)
            tags.update(tablecontext.section_header)
            # Merge tags from column. For certain kinds of tags,
            # those coming from row take precedence.
            old_tags = set(tags)
            for t in ct:
                c = valid_tags[t]
                # Skip column mood/case/number tags when the row
                # already supplied one in the same category.
                if c in ("mood", "case", "number") and any(
                    valid_tags[tt] == c for tt in old_tags
                ):
                    continue
                tags.add(t)

            # Extract language-specific tags from the
            # form. This may also adjust the form.
            form, lang_tags = lang_specific_tags(lang, pos, form)
            tags.update(lang_tags)

            # For non-finite verb forms, see if they have
            # a gender/class suffix
            if pos == "verb" and any(
                valid_tags[t] == "non-finite" for t in tags
            ):
                form, tt = parse_head_final_tags(wxr, lang, form)
                tags.update(tt)

            # Remove "personal" tag if have nth person; these
            # come up with e.g. reconhecer/Portuguese/Verb. But
            # not if we also have "pronoun"
            if (
                "personal" in tags
                and "pronoun" not in tags
                and any(
                    x in tags
                    for x in [
                        "first-person",
                        "second-person",
                        "third-person",
                    ]
                )
            ):
                tags.remove("personal")

            # If we have impersonal, remove person and number.
            # This happens with e.g. viajar/Portuguese/Verb
            if "impersonal" in tags:
                tags = tags - set(
                    [
                        "first-person",
                        "second-person",
                        "third-person",
                        "singular",
                        "plural",
                    ]
                )

            # Remove unnecessary "positive" tag from verb forms
            if pos == "verb" and "positive" in tags:
                if "negative" in tags:
                    tags.remove("negative")
                tags.remove("positive")

            # Many Russian (and other Slavic) inflection tables
            # have animate/inanimate distinction that generates
            # separate entries for neuter/feminine, but the
            # distinction only applies to masculine. Remove them
            # from neuter/feminine and eliminate duplicates.
            if get_lang_conf(lang, "masc_only_animate"):
                for t1 in ("animate", "inanimate"):
                    for t2 in ("neuter", "feminine"):
                        if (
                            t1 in tags
                            and t2 in tags
                            and "masculine" not in tags
                            and "plural" not in tags
                        ):
                            tags.remove(t1)

            # German adjective tables contain "(keiner)" etc
            # for mixed declension plural. When the adjective
            # disappears and it becomes just one word, remove
            # the "includes-article" tag. e.g. eiskalt/German
            if "includes-article" in tags and " " not in form:
                tags.remove("includes-article")

            # Handle ignored forms. We mark that the form was
            # provided. This is important information; some words
            # just do not have a certain form. However, there are also
            # many cases where no word in a language has a
            # particular form. Post-processing could detect and
            # remove such cases.
            if form in IGNORED_COLVALUES:
                # if cell text seems to be ignorable
                if "dummy-ignore-skipped" in tags:
                    continue
                if (
                    col_idx not in has_covering_hdr
                    and some_has_covered_text
                ):
                    continue
                # don't ignore this cell if there's been a header
                # above it
                form = "-"
            elif col_idx in has_covering_hdr:
                some_has_covered_text = True

            # Handle ambiguous object concord. If a header
            # gives the "dummy-object-concord"-tag to a word,
            # replace person, number and gender tags with
            # their "object-" counterparts so that the verb
            # agrees with the object instead.
            # Use only when the verb has ONLY object agreement!
            # a پخول/Pashto
            if "dummy-object-concord" in tags:
                for subtag, objtag in object_concord_replacements.items():
                    if subtag in tags:
                        tags.remove(subtag)
                        tags.add(objtag)

            # Remove the dummy mood tag that we sometimes
            # use to block adding other mood and related
            # tags
            tags = tags - set(
                [
                    "dummy-mood",
                    "dummy-tense",
                    "dummy-ignore-skipped",
                    "dummy-object-concord",
                    "dummy-reset-headers",
                    "dummy-use-as-coltags",
                    "dummy-use-as-rowtags",
                    "dummy-store-hdrspan",
                    "dummy-load-stored-hdrspans",
                    "dummy-reset-stored-hdrspans",
                    "dummy-section-header",
                ]
            )

            # Perform language-specific tag replacements according
            # to rules in a table.
            lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
            if lang_tag_mappings is not None:
                for pre, post in lang_tag_mappings.items():
                    if all(t in tags for t in pre):
                        tags = (tags - set(pre)) | set(post)

            # Warn if there are entries with empty tags
            if not tags:
                wxr.wtp.debug(
                    "inflection table: empty tags for {}".format(form),
                    sortid="inflection/1826",
                )

            # Warn if form looks like IPA
            ########## XXX ########
            # Because IPA is its own unicode block, we could also
            # technically do a Unicode name check to see if a string
            # contains IPA. Not all valid IPA characters are in the
            # IPA extension block, so you can technically have false
            # negatives if it's something like /toki/, but it
            # shouldn't give false positives.
            # Alternatively, you could make a list of IPA-admissible
            # characters and reject non-IPA stuff with that.
            if re.match(r"\s*/.*/\s*$", form):
                wxr.wtp.debug(
                    "inflection table form looks like IPA: "
                    "form={} tags={}".format(form, tags),
                    sortid="inflection/1840",
                )

            # Note that this checks `form`, not `in tags`
            if form == "dummy-ignored-text-cell":
                continue

            if "dummy-remove-this-cell" in tags:
                continue

            # Add the form
            tags = list(sorted(tags))
            dt = {"form": form, "tags": tags, "source": source}
            if roman:
                dt["roman"] = roman
            if ipa:
                dt["ipa"] = ipa
            ret.append(dt)
            # If we got separate clitic form, add it
            if clitic:
                dt = {
                    "form": clitic,
                    "tags": tags + ["clitic"],
                    "source": source,
                }
                ret.append(dt)
    return ret, form, some_has_covered_text
2165 # First extract definitions from cells
2166 # See defs_ht for footnote defs stuff
2167 for row in rows:
2168 for cell in row:
2169 text, refs, defs, hdr_tags = extract_cell_content(
2170 lang, word, cell.text
2171 )
2172 # refs, defs = footnote stuff, defs -> (ref, def)
2173 add_defs(defs)
2174 # Extract definitions from text after table
2175 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after)
2176 add_defs(defs)
2178 # Then extract the actual forms
2179 ret = []
2180 hdrspans = []
2181 first_col_has_text = False
2182 rownum = 0
2183 title = None
2184 global_tags = []
2185 table_tags = []
2186 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits")
2187 form_replacements = get_lang_conf(lang, "form_replacements")
2188 form_transformations = get_lang_conf(lang, "form_transformations")
2189 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells")
2190 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups")
2192 for title in titles:
2193 more_global_tags, more_table_tags, extra_forms = parse_title(
2194 title, source
2195 )
2196 global_tags.extend(more_global_tags)
2197 table_tags.extend(more_table_tags)
2198 ret.extend(extra_forms)
2199 cell_rowcnt = collections.defaultdict(int)
2200 seen_cells = set()
2201 has_covering_hdr = set()
2202 some_has_covered_text = False
2203 for row in rows:
2204 # print("ROW:", row)
2205 # print("====")
2206 # print(f"Start of PREVIOUS row hdrspans:"
2207 # f"{tuple(sp.tagsets for sp in hdrspans)}")
2208 # print(f"Start of row txt: {tuple(t.text for t in row)}")
2209 if not row: 2209 ↛ 2210line 2209 didn't jump to line 2210 because the condition on line 2209 was never true
2210 continue # Skip empty rows
2211 all_headers = all(x.is_title or not x.text.strip() for x in row)
2212 text = row[0].text
2213 if (
2214 row[0].is_title
2215 and text
2216 and not is_superscript(text[0])
2217 and text not in infl_map # zealous inflation map?
2218 and (
2219 re.match(r"Inflection ", text)
2220 or re.sub(
2221 r"\s+",
2222 " ", # flatten whitespace
2223 re.sub(
2224 r"\s*\([^)]*\)",
2225 "",
2226 # Remove whitespace+parens
2227 text,
2228 ),
2229 ).strip()
2230 not in infl_map
2231 )
2232 and not re.match(infl_start_re, text)
2233 and all(
2234 x.is_title == row[0].is_title and x.text == text
2235 # all InflCells in `row` have the same is_title and text
2236 for x in row
2237 )
2238 ):
2239 if text and title is None:
2240 # Only if there were no titles previously make the first
2241 # text that is found the title
2242 title = text
2243 if re.match(r"(Note:|Notes:)", title): 2243 ↛ 2244line 2243 didn't jump to line 2244 because the condition on line 2243 was never true
2244 continue # not a title
2245 more_global_tags, more_table_tags, extra_forms = parse_title(
2246 title, source
2247 )
2248 global_tags.extend(more_global_tags)
2249 table_tags.extend(more_table_tags)
2250 ret.extend(extra_forms)
2251 continue # Skip title rows without incrementing i
2252 if "dummy-skip-this" in global_tags: 2252 ↛ 2253line 2252 didn't jump to line 2253 because the condition on line 2252 was never true
2253 return []
2254 rowtags = [()]
2255 # have_hdr = False
2256 # have_hdr never used?
2257 have_text = False
2258 samecell_cnt = 0
2259 col0_hdrspan = None # col0 or later header (despite its name)
2260 col0_followed_by_nonempty = False
2261 row_empty = True
2262 for col_idx, cell in enumerate(row):
2263 colspan = cell.colspan # >= 1
2264 rowspan = cell.rowspan # >= 1
2265 previously_seen = id(cell) in seen_cells
2266 # checks to see if this cell was in the previous ROW
2267 seen_cells.add(id(cell))
2268 if samecell_cnt == 0:
2269 # First column of a (possible multi-column) cell
2270 samecell_cnt = colspan - 1
2271 else:
2272 assert samecell_cnt > 0
2273 samecell_cnt -= 1
2274 continue
2276 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0
2277 # never used?
2279 # defaultdict(int) around line 1900
2280 cell_rowcnt[id(cell)] += 1
2281 # => how many cols this spans
2282 col = cell.text
2283 if not col:
2284 continue
2285 row_empty = False
2286 is_title = cell.is_title
2288 # If the cell has a target, i.e., text after colon, interpret
2289 # it as simply specifying a value for that value and ignore
2290 # it otherwise.
2291 if cell.target:
2292 text, refs, defs, hdr_tags = extract_cell_content(
2293 lang, word, col
2294 )
2295 if not text: 2295 ↛ 2296line 2295 didn't jump to line 2296 because the condition on line 2295 was never true
2296 continue
2297 refs_tags = set()
2298 for ref in refs: # gets tags from footnotes 2298 ↛ 2299line 2298 didn't jump to line 2299 because the loop on line 2298 never started
2299 if ref in def_ht:
2300 refs_tags.update(def_ht[ref])
2301 rowtags = expand_header(
2302 wxr,
2303 tablecontext,
2304 word,
2305 lang,
2306 pos,
2307 text,
2308 [],
2309 silent=True,
2310 depth=depth,
2311 )
2312 rowtags = list(
2313 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags)
2314 )
2315 is_title = False
2316 col = cell.target
2318 # print(rownum, col_idx, col)
2319 # print(f"is_title: {is_title}")
2320 if is_title:
2321 # It is a header cell
2322 text, refs, defs, hdr_tags = extract_cell_content(
2323 lang, word, col
2324 )
2325 if not text:
2326 continue
2327 # Extract tags from referenced footnotes
2328 refs_tags = set()
2329 for ref in refs:
2330 if ref in def_ht:
2331 refs_tags.update(def_ht[ref])
2333 # Expand header to tags
2334 v = expand_header(
2335 wxr,
2336 tablecontext,
2337 word,
2338 lang,
2339 pos,
2340 text,
2341 [],
2342 silent=True,
2343 depth=depth,
2344 )
2345 # print("EXPANDED {!r} to {}".format(text, v))
2347 if col_idx == 0:
2348 # first_col_has_text is used for a test to ignore
2349 # upper-left cells that are just text without
2350 # header info
2351 first_col_has_text = True
2352 # Check if the header expands to reset hdrspans
2353 if any("dummy-reset-headers" in tt for tt in v):
2354 new_hdrspans = []
2355 for hdrspan in hdrspans:
2356 # if there are HdrSpan objects (abstract headers with
2357 # row- and column-spans) that are to the left or at the
2358 # same row or below, KEEP those; things above and to
2359 # the right of the hdrspan with dummy-reset-headers
2360 # are discarded. Tags from the header together with
2361 # dummy-reset-headers are kept as normal.
2362 if (
2363 hdrspan.start + hdrspan.colspan < col_idx
2364 or hdrspan.rownum > rownum - cell.rowspan
2365 ):
2366 new_hdrspans.append(hdrspan)
2367 hdrspans = new_hdrspans
2369 for tt in v:
2370 if "dummy-section-header" in tt: 2370 ↛ 2371line 2370 didn't jump to line 2371 because the condition on line 2370 was never true
2371 tablecontext.section_header = tt
2372 break
2373 if "dummy-reset-section-header" in tt: 2373 ↛ 2374line 2373 didn't jump to line 2374 because the condition on line 2373 was never true
2374 tablecontext.section_header = []
2375 # Text between headers on a row causes earlier headers to
2376 # be reset
2377 if have_text:
2378 # print(" HAVE_TEXT BEFORE HDR:", col)
2379 # Reset rowtags if new title column after previous
2380 # text cells
2381 # +-----+-----+-----+-----+
2382 # |hdr-a|txt-a|hdr-B|txt-B|
2383 # +-----+-----+-----+-----+
2384 # ^reset rowtags=>
2385 # XXX beware of header "—": "" - must not clear on that if
2386 # it expands to no tags
2387 rowtags = [()]
2388 # have_hdr = True
2389 # have_hdr never used?
2390 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags))
2391 # Update rowtags and coltags
2392 has_covering_hdr.add(col_idx) # col_idx == current column
2393 # has_covering_hdr is a set that has the col_idx-ids of columns
2394 # that have previously had some kind of header. It is never
2395 # resetted inside the col_idx-loops OR the bigger rows-loop, so
2396 # applies to the whole table.
2398 rowtags, new_coltags, all_hdr_tags = generate_tags(
2399 rowtags, table_tags
2400 )
2402 if any("dummy-skip-this" in ts for ts in rowtags):
2403 continue # Skip this cell
2405 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2405 ↛ 2406line 2405 didn't jump to line 2406 because the condition on line 2405 was never true
2406 hdrspans.extend(tablecontext.stored_hdrspans)
2408 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2408 ↛ 2409line 2408 didn't jump to line 2409 because the condition on line 2408 was never true
2409 tablecontext.stored_hdrspans = []
2411 if any("dummy-store-hdrspan" in ts for ts in v): 2411 ↛ 2413line 2411 didn't jump to line 2413 because the condition on line 2411 was never true
2412 # print(f"STORED: {col}")
2413 store_new_hdrspan = True
2414 else:
2415 store_new_hdrspan = False
2417 new_coltags = list(
2418 x
2419 for x in new_coltags
2420 if not any(t in noinherit_tags for t in x)
2421 )
2422 # print("new_coltags={} previously_seen={} all_hdr_tags={}"
2423 # .format(new_coltags, previously_seen, all_hdr_tags))
2424 if any(new_coltags):
2425 (
2426 col,
2427 col0_followed_by_nonempty,
2428 col0_hdrspan,
2429 ) = add_new_hdrspan(
2430 col,
2431 hdrspans,
2432 store_new_hdrspan,
2433 col0_followed_by_nonempty,
2434 col0_hdrspan,
2435 )
2437 continue
2439 # These values are ignored, at least for now
2440 if re.match(r"^(# |\(see )", col): 2440 ↛ 2441line 2440 didn't jump to line 2441 because the condition on line 2440 was never true
2441 continue
2443 if any("dummy-skip-this" in ts for ts in rowtags):
2444 continue # Skip this cell
2446 # If the word has no rowtags and is a multi-row cell, then
2447 # ignore this. This happens with empty separator rows
2448 # within a rowspan>1 cell. cf. wander/English/Conjugation.
2449 if rowtags == [()] and rowspan > 1:
2450 continue
2452 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle.
2453 if cleanup_rules:
2454 for regx, substitution in cleanup_rules.items():
2455 col = re.sub(regx, substitution, col)
2457 if ( 2457 ↛ 2462line 2457 didn't jump to line 2462 because the condition on line 2457 was never true
2458 col_idx == 0
2459 and not first_col_has_text
2460 and get_lang_conf(lang, "ignore_top_left_text_cell") is True
2461 ):
2462 continue # Skip text at top left, as in Icelandic, Faroese
2464 # if col0_hdrspan is not None:
2465 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}"
2466 # .format(col0_hdrspan.text, col))
2467 col0_followed_by_nonempty = True
2468 have_text = True
2470 # Determine column tags for the multi-column cell
2471 combined_coltags = compute_coltags(
2472 lang, pos, hdrspans, col_idx, colspan, col
2473 )
2474 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2474 ↛ 2475line 2474 didn't jump to line 2475 because the condition on line 2474 was never true
2475 continue
2477 # Split the text into separate forms. First simplify spaces except
2478 # newline.
2479 col = re.sub(r"[ \t\r]+", " ", col)
2480 # Split the cell text into alternatives
2482 col, alts, split_extra_tags = split_text_into_alts(col)
2484 # Some cells have mixed form content, like text and romanization,
2485 # or text and IPA. Handle these.
2486 alts = handle_mixed_lines(alts)
2488 alts = list((x, combined_coltags) for x in alts)
2490 # Generate forms from the alternatives
2491 # alts is a list of (tuple of forms, tuple of tags)
2492 for (form, base_roman, ipa), coltags in alts:
2493 form = form.strip()
2494 extra_tags = []
2495 extra_tags.extend(split_extra_tags)
2496 # Handle special splits again here, so that we can have custom
2497 # mappings from form to form and tags.
2498 if form in form_replacements:
2499 replacement, tags = form_replacements[form]
2500 for x in tags.split():
2501 assert x in valid_tags
2502 assert isinstance(replacement, str)
2503 assert isinstance(tags, str)
2504 form = replacement
2505 extra_tags.extend(tags.split())
2507 check_romanization_form_transformation = False
2508 # loop over regexes in form_transformation and replace text
2509 # in form using regex patterns
2510 # this does a bit of the same stuff the above does,
2511 # but with regexes and re.sub() instead
2512 for (
2513 form_transformations_pos,
2514 v,
2515 subst,
2516 tags,
2517 ) in form_transformations:
2518 # v is a pattern string, like "^ich"
2519 if pos != form_transformations_pos:
2520 continue
2521 m = re.search(v, form)
2522 if m is not None:
2523 form = re.sub(v, subst, form)
2524 for x in tags.split():
2525 assert x in valid_tags
2526 extra_tags.extend(tags.split())
2527 check_romanization_form_transformation = True
2528 break
2530 # Clean the value, extracting reference symbols
2531 form, refs, defs, hdr_tags = extract_cell_content(
2532 lang, word, form
2533 )
2534 # if refs:
2535 # print("REFS:", refs)
2536 extra_tags.extend(hdr_tags)
2537 # Extract tags from referenced footnotes
2538 # Extract tags from referenced footnotes
2539 refs_tags = set()
2540 for ref in refs:
2541 if ref in def_ht:
2542 refs_tags.update(def_ht[ref])
2544 if base_roman:
2545 if check_romanization_form_transformation: 2545 ↛ 2549line 2545 didn't jump to line 2549 because the condition on line 2545 was never true
2546 # because form_transformations are used to handle things
2547 # where the romanization has the "same" structure, we
2548 # need to handle that here too....
2549 for (
2550 _,
2551 v,
2552 subst,
2553 _,
2554 ) in form_transformations:
2555 # v is a pattern string, like "^ich"
2556 m = re.search(v, base_roman)
2557 if m is not None:
2558 base_roman = re.sub(v, subst, base_roman)
2559 # XXX add tag stuff here if needed
2560 break
2562 base_roman, _, _, hdr_tags = extract_cell_content(
2563 lang, word, base_roman
2564 )
2565 extra_tags.extend(hdr_tags)
2567 # Do some additional cleanup on the cell.
2568 form = re.sub(r"^\s*,\s*", "", form)
2569 form = re.sub(r"\s*,\s*$", "", form)
2570 form = re.sub(r"\s*(,\s*)+", ", ", form)
2571 form = re.sub(r"(?i)^Main:", "", form)
2572 form = re.sub(r"\s+", " ", form)
2573 form = form.strip()
2575 # Look for parentheses that have semantic meaning
2576 form, et = find_semantic_parens(form)
2577 extra_tags.extend(et)
2579 # Handle parentheses in the table element. We parse
2580 # tags anywhere and romanizations anywhere but beginning.
2581 roman = base_roman
2582 paren = None
2583 clitic = None
2584 m = re.search(r"(\s+|^)\(([^)]*)\)", form)
2585 # start|spaces + (anything)
2586 if m is not None:
2587 subst = m.group(1)
2588 paren = m.group(2)
2589 else:
2590 m = re.search(r"\(([^)]*)\)(\s+|$)", form)
2591 # (anything) + spaces|end
2592 if m is not None: 2592 ↛ 2593line 2592 didn't jump to line 2593 because the condition on line 2592 was never true
2593 paren = m.group(1)
2594 subst = m.group(2)
2595 if paren is not None:
2596 form, roman, clitic = handle_parens(
2597 form, roman, clitic, extra_tags
2598 )
2600 # Ignore certain forms that are not really forms,
2601 # unless they're really, really close to the article title
2602 if form in ( 2602 ↛ 2607line 2602 didn't jump to line 2607 because the condition on line 2602 was never true
2603 "",
2604 "unchanged",
2605 "after an", # in sona/Irish/Adj/Mutation
2606 ):
2607 Lev = distw([form], word)
2608 if form and Lev < 0.1:
2609 wxr.wtp.debug(
2610 "accepted possible false positive '{}' with"
2611 "> 0.1 Levenshtein distance in {}/{}".format(
2612 form, word, lang
2613 ),
2614 sortid="inflection/2213",
2615 )
2616 elif form and Lev < 0.3:
2617 wxr.wtp.debug(
2618 "skipped possible match '{}' with > 0.3"
2619 "Levenshtein distance in {}/{}".format(
2620 form, word, lang
2621 ),
2622 sortid="inflection/2218",
2623 )
2624 continue
2625 else:
2626 continue
2627 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} "
2628 # "FORM={!r} ROMAN={!r}"
2629 # .format(rowtags, coltags, refs_tags,
2630 # form, roman))
2632 # Merge tags from row and column and do miscellaneous
2633 # tag-related handling.
2634 (
2635 merge_ret,
2636 form,
2637 some_has_covered_text,
2638 ) = merge_row_and_column_tags(form, some_has_covered_text)
2639 ret.extend(merge_ret)
2641 # End of row.
2642 rownum += 1
2643 # For certain languages, if the row was empty, reset
2644 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb).
2645 if row_empty and get_lang_conf(lang, "empty_row_resets"):
2646 hdrspans = []
2647 # Check if we should expand col0_hdrspan.
2648 if col0_hdrspan is not None:
2649 col0_allowed = get_lang_conf(lang, "hdr_expand_first")
2650 col0_cats = tagset_cats(col0_hdrspan.tagsets)
2651 # Only expand if col0_cats and later_cats are allowed
2652 # and don't overlap and col0 has tags, and there have
2653 # been no disallowed cells in between.
2654 if (
2655 not col0_followed_by_nonempty
2656 and not (col0_cats - col0_allowed)
2657 and
2658 # len(col0_cats) == 1 and
2659 col_idx > col0_hdrspan.start + col0_hdrspan.colspan
2660 ):
2661 # If an earlier header is only followed by headers that yield
2662 # no tags, expand it to entire row
2663 # print("EXPANDING COL0: {} from {} to {} cols {}"
2664 # .format(col0_hdrspan.text, col0_hdrspan.colspan,
2665 # len(row) - col0_hdrspan.start,
2666 # col0_hdrspan.tagsets))
2667 col0_hdrspan.colspan = len(row) - col0_hdrspan.start
2668 col0_hdrspan.expanded = True
2669 # XXX handle refs and defs
2670 # for x in hdrspans:
2671 # print(" HDRSPAN {} {} {} {!r}"
2672 # .format(x.start, x.colspan, x.tagsets, x.text))
2674 # Post-process German nouns with articles in separate columns. We move the
2675 # definite/indefinite/usually-without-article markers into the noun and
2676 # remove the article entries.
2677 if get_lang_conf(lang, "articles_in_separate_columns") and any(
2678 "noun" in x["tags"] for x in ret
2679 ):
2680 new_ret = []
2681 saved_tags = set()
2682 had_noun = False
2683 for dt in ret:
2684 tags = dt["tags"]
2685 # print(tags)
2686 if "noun" in tags:
2687 tags = list(
2688 sorted(set(t for t in tags if t != "noun") | saved_tags)
2689 )
2690 had_noun = True
2691 elif ( 2691 ↛ 2718line 2691 didn't jump to line 2718 because the condition on line 2691 was always true
2692 "indefinite" in tags
2693 or "definite" in tags
2694 or "usually-without-article" in tags
2695 or "without-article" in tags
2696 ):
2697 if had_noun:
2698 saved_tags = set(tags)
2699 else:
2700 saved_tags = saved_tags | set(tags) # E.g. Haus/German
2701 remove_useless_tags(lang, pos, saved_tags)
2702 saved_tags = saved_tags & set(
2703 [
2704 "masculine",
2705 "feminine",
2706 "neuter",
2707 "singular",
2708 "plural",
2709 "indefinite",
2710 "definite",
2711 "usually-without-article",
2712 "without-article",
2713 ]
2714 )
2715 had_noun = False
2716 continue # Skip the articles
2718 dt = dt.copy()
2719 dt["tags"] = tags
2720 new_ret.append(dt)
2721 ret = new_ret
2723 elif possibly_ignored_forms:
2724 # Some languages have tables with cells that are kind of separated
2725 # and difficult to handle, like eulersche Formel/German where
2726 # the definite and indefinite articles are just floating.
2727 # If a language has a dict of conditionally_ignored_cells,
2728 # and if the contents of a cell is found in one of the rules
2729 # there, ignore that cell if it
2730 # 1. Does not have the appropriate tag (like "definite" for "die")
2731 # and
2732 # 2. The title of the article is not one of the other co-words
2733 # (ie. it's an article for the definite articles in german etc.)
2734 # pass
2735 new_ret = []
2736 for cell_data in ret:
2737 tags = cell_data["tags"]
2738 text = cell_data["form"]
2739 skip_this = False
2740 for key_tag, ignored_forms in possibly_ignored_forms.items():
2741 if text not in ignored_forms: 2741 ↛ 2743line 2741 didn't jump to line 2743 because the condition on line 2741 was always true
2742 continue
2743 if word in ignored_forms:
2744 continue
2745 if key_tag not in tags:
2746 skip_this = True
2748 if skip_this: 2748 ↛ 2749line 2748 didn't jump to line 2749 because the condition on line 2748 was never true
2749 continue
2750 new_ret.append(cell_data)
2752 ret = new_ret
2754 # Post-process English inflection tables, addding "multiword-construction"
2755 # when the number of words has increased.
2756 if lang == "English" and pos == "verb":
2757 word_words = len(word.split())
2758 new_ret = []
2759 for dt in ret:
2760 form = dt.get("form", "")
2761 if len(form.split()) > word_words:
2762 dt = dt.copy()
2763 dt["tags"] = list(dt.get("tags", []))
2764 # This strange copy-assigning shuffle is preventative black
2765 # magic; do not touch lest you invoke deep bugs.
2766 data_append(dt, "tags", "multiword-construction")
2767 new_ret.append(dt)
2768 ret = new_ret
2770 # Always insert "table-tags" detail as the first entry in any inflection
2771 # table. This way we can reliably detect where a new table starts.
2772 # Table-tags applies until the next table-tags entry.
2773 if ret or table_tags:
2774 table_tags = list(sorted(set(table_tags)))
2775 dt = {
2776 "form": " ".join(table_tags),
2777 "source": source,
2778 "tags": ["table-tags"],
2779 }
2780 if dt["form"] == "":
2781 dt["form"] = "no-table-tags"
2782 if tablecontext.template_name:
2783 tn = {
2784 "form": tablecontext.template_name,
2785 "source": source,
2786 "tags": ["inflection-template"],
2787 }
2788 ret = [dt] + [tn] + ret
2789 else:
2790 ret = [dt] + ret
2792 return ret
def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Parse ``rows`` as a simple inflection table and append the resulting
    form entries to ``data["forms"]``, eliminating duplicate forms."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert all(isinstance(s, str) for s in (word, lang, pos, source, after))
    assert isinstance(depth, int)
    assert isinstance(rows, list)
    assert all(
        isinstance(row, list) and all(isinstance(c, InflCell) for c in row)
        for row in rows
    )
    assert isinstance(titles, list)
    assert all(isinstance(t, str) for t in titles)

    # Attempt to parse the table with the simple-table strategy.
    parsed = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if parsed is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms, skipping exact duplicates.
    seen_forms = set()
    for entry in parsed:
        frozen = freeze(entry)
        if frozen in seen_forms:
            continue  # exact duplicate of an earlier form
        entry_tags = entry.get("tags", [])
        # Some Russian words have a Declension and a Pre-reform declension
        # that partially duplicate the same data.  Skip a "dated"-tagged
        # variant when the same form without "dated" was already added
        # from the modern declension table.
        if "dated" in entry_tags:
            undated = entry.copy()
            undated_tags = [t for t in entry_tags if t != "dated"]
            undated["tags"] = undated_tags
            if undated_tags and freeze(undated) in seen_forms:
                continue  # already have the variant without "dated"
        if "table-tags" not in entry_tags:
            seen_forms.add(frozen)
        data_append(data, "forms", entry)
def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Decide whether a table cell should be treated as a header cell.

    Combines several heuristics in order: expansion of the cleaned cell
    text via expand_header() (no "error-" tags => candidate), the node
    kind of the cell (th vs td), the per-language whitelist
    LANGUAGES_WITH_CELLS_AS_HEADERS, style comparison against the first
    column's style, whole-column headering via ``cols_headered``, and a
    few fixed title patterns ("Conjugation of ...", etc.).

    Returns a tuple (is_title, hdr_expansion, target, celltext);
    ``target`` is the text after "<header>: " when the cell combines a
    header prefix with a target form, in which case ``celltext`` is
    truncated to the header part.
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # NOTE(review): header_kind is only assigned for TABLE and HTML
    # table kinds; any other kind would raise NameError below — callers
    # appear to guarantee only these two are passed.
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    # "header: target" cells are split at the first ": " occurrence.
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # The cell looks like a header but is not marked up as one;
        # accept it only for languages/texts explicitly whitelisted.
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    if idx >= 0 and titletext[:idx] in infl_map:
        # "header: target" form — split off the target text.
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext
class TableContext:
    """Saved context used when parsing a table and its subtables.

    Holds state that must survive across nested/continued tables:
    ``stored_hdrspans`` (header spans carried between subtables),
    ``section_header`` (header data for the enclosing section) and
    ``template_name`` (name of the inflection template that produced
    the table, or "" when unknown).
    """

    # Fix: this attribute was previously misspelled ``__slot__``, which
    # Python silently treats as an ordinary (inert) class attribute.
    # ``__slots__`` is the correct name and actually restricts instances
    # to these attributes, which is what the declaration intends.
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name=None):
        self.stored_hdrspans = []
        self.section_header = []
        # Normalize falsy values (None, "") to the empty string.
        self.template_name = template_name or ""
def handle_wikitext_or_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms.  This adds the forms into ``data``.

    ``tree`` must be either a wikitext TABLE node or an HTML <table>
    node; ``titles`` is the list of title strings collected around the
    table, ``source`` names the section the table came from, and
    ``after`` is cleaned text that followed the table.  ``tablecontext``
    carries state across nested/continued tables.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("-------==========-------")

    if not tablecontext:
        tablecontext = TableContext()

    def handle_table1(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        depth,
    ):
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order.

        Returns a list of (rows, titles, after, depth) tuples, one per
        (sub)table piece, in document order.
        """

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        col_gap_data = []  # Filling for columns with rowspan > 1
        # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
        # the column; vertical_still_left contains int
        cols_headered = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows = []

        sub_ret = []

        # from wikitextprocessor.parser import print_tree
        # print_tree(tree)
        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            # Normalize kind: for HTML nodes use the tag name string,
            # for wikitext nodes the NodeKind enum value.
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are intially shown in tables
                    # that have more data.  The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # if (
                #     len(node.children) == 1
                #     and node.children[0].attrs.get("class") == "separator"
                # ):
                #     print("------------------ skip separator")
                #     continue

                # Parse a table row.
                row = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in get_table_cells(node):
                    # loop through each cell in the ROW

                    # The below skip is not needed anymore, because we "skip" in
                    # get_table_cells, but left here as a comment
                    # if not isinstance(col, WikiNode):
                    #     # This skip is not used for counting,
                    #     # "None" is not used in
                    #     # indexing or counting or looping.
                    #     continue
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print("    UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column. It is populated at the end of the loop, at the
                        # same time as col_gap_data. This needs to be looped and
                        # filled this way because each `for col`-looping jumps
                        # straight to the next meaningful cell; there is no
                        # "None" cells, only emptiness between, and rowspan and
                        # colspan are just to generate the "fill-
                        vertical_still_left[len(row)] -= 1
                        row.append(col_gap_data[len(row)])

                        # appending row is how "indexing" is
                        # done here; something is appended,
                        # like a filler-cell here or a "start"
                        # cell at the end of the row-loop,
                        # which increased len(row) which is
                        # then used as the target-index to check
                        # for gaps. vertical_still_left is
                        # the countdown to when to stop
                        # filling in gaps, and goes down to 0,
                        # and col_gap_data is not touched
                        # except when a new rowspan is needed,
                        # at the same time that
                        # vertical_still_left gets reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        # Malformed rowspan/colspan attributes fall back
                        # to a plain 1x1 cell.
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Too many of these errors
                    if colspan > 100:
                        # wxr.wtp.error(
                        #     f"Colspan {colspan} over 30, set to 1",
                        #     sortid="inflection/20250113a",
                        # )
                        colspan = 100
                    if rowspan > 100:
                        # wxr.wtp.error(
                        #     f"Rowspan {rowspan} over 30, set to 1",
                        #     sortid="inflection/20250113b",
                        # )
                        rowspan = 100

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: isinstance(x, WikiNode)
                        and (x.kind == NodeKind.TABLE or x.sarg == "table"),
                    )

                    # Clean the rest of the cell.
                    celltext = clean_node(wxr, None, rest)
                    # print("CLEANED:", celltext)
                    # print(f"SUBTABLES: {tables}")

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a descrip-
                        # tive text that should be treated as a title (e.g.,
                        # "Forms with the definite article", with "definite" not
                        # mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Flush what we have so far as one piece, then
                            # splice in the subtable's pieces in order.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    # Strip trailing superscripts (reference marks) from
                    # the title text before header detection.
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #     row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/ed if
                                # it is lacking slots
                                # for each column; col_gap_data and
                                # vertical_still_left are never
                                # reset to [], during
                                # the whole table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that
                            # will be used in the
                            # future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to be
                            # filled (row.append or
                            # row[col_gap_data[len(row)] =>
                            # rows), it is not reset to [], but decremented to 0
                            # each time a row gets something from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects. Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )
def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """Yield the cell nodes of a table row.

    Wikitext table cells sometimes contain raw HTML ``<th>``/``<td>``
    elements (it is easier to write wikitext conditionals that way), and
    those end up parsed as children of the wikitext cell.  This yields
    every WikiNode child of ``node``, and when a cell has direct
    th/td children, those are detached from the cell and yielded right
    after it."""

    def is_html_cell(child):
        # True for a direct <th> or <td> HTML element.
        return isinstance(child, HTMLNode) and child.sarg in ("th", "td")

    for cell in node.children:
        if not isinstance(cell, WikiNode):
            continue
        embedded = [c for c in cell.children if is_html_cell(c)]
        if embedded:
            # Detach the td/th elements so they are not returned twice.
            cell.children = [c for c in cell.children if not is_html_cell(c)]
            yield cell
            yield from embedded
        else:
            yield cell
def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Thin pass-through for HTML tables; delegates everything to
    handle_wikitext_or_html_table().  XXX consider removing."""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )
def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """A passer-on function for wikitext tables (docstring previously
    said "html-tables" by copy-paste); XXX, remove these?"""
    handle_wikitext_or_html_table(
        wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
    )
def parse_inflection_section(
    wxr, data, word, lang, pos, section, tree, tablecontext=None
):
    """Parses an inflection section on a page. ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    Walks ``tree`` collecting tables (wikitext and HTML) together with
    the title text that precedes them (NavFrame headers, bolded ";"
    list items), then dispatches each table to the appropriate handler.
    """

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    #       .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Each entry in ``tables`` is [kind, node, titles, after-strings];
    # the last element accumulates text seen after the table.
    tables = []
    titleparts = []
    preceding_bolded_title = ""

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("--------------******************----------------")

    def process_tables():
        # Dispatch every collected table to its handler; "after" text is
        # joined and cleaned first.
        for kind, node, titles, after in tables:
            after = "".join(after).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node, titles):
        # Handle a NavFrame div: collect its NavHead text as a title and
        # process the tables inside its NavContent immediately, using a
        # fresh ``tables`` list so outer tables are not disturbed.
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(node, titles, navframe=False):
        # Generic tree walk; ``navframe`` is True while inside a
        # NavFrame's header area (text then accumulates into titleparts).
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Text after a collected table belongs to that table's
            # "after" accumulator; NavFrame header text becomes a title.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                classes = node.attrs.get("class", ())
                if "audiotable" in classes:
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
            if (
                kind == NodeKind.HTML
                and node.sarg == "div"
                and "NavFrame" in node.attrs.get("class", "").split()
            ):
                recurse_navframe(node, titles)
                return
        if kind == NodeKind.LINK:
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A bolded ";" list item acts as a title for the tables
            # that follow it at top level.
            nonlocal preceding_bolded_title
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")