# Code for parsing inflection tables.
#
# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org.

import collections
import copy
import functools
import html
import re
import unicodedata
from typing import Generator, Optional, Union

from mediawiki_langcodes import code_to_name, name_to_code
from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode

from ...clean import clean_value
from ...datautils import data_append, freeze, split_at_comma_semi
from ...tags import valid_tags
from ...wxr_context import WiktextractContext
from .form_descriptions import (
    classify_desc,
    decode_tags,
    distw,
    parse_head_final_tags,
)
from .inflection_kludges import ka_decl_noun_template_cell
from .inflectiondata import infl_map, infl_start_map, infl_start_re
from .lang_specific_configs import get_lang_conf, lang_specific_tags
from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS
from .type_utils import FormData

# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
debug_cell_text: Optional[str] = None


def set_debug_cell_text(text: str) -> None:
    global debug_cell_text
    debug_cell_text = text


TagSets = list[tuple[str, ...]]

# Column texts that are interpreted as an empty column.
IGNORED_COLVALUES = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "－",
    "/",
    "?",
    "not used",
    "not applicable",
}

# These tags are never inherited from above
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags
object_concord_replacements = {
    "first-person": "object-first-person",
    "second-person": "object-second-person",
    "third-person": "object-third-person",
    "singular": "object-singular",
    "plural": "object-plural",
    "definite": "object-definite",
    "indefinite": "object-indefinite",
    "class-1": "object-class-1",
    "class-2": "object-class-2",
    "class-3": "object-class-3",
    "class-4": "object-class-4",
    "class-5": "object-class-5",
    "class-6": "object-class-6",
    "class-7": "object-class-7",
    "class-8": "object-class-8",
    "class-9": "object-class-9",
    "class-10": "object-class-10",
    "class-11": "object-class-11",
    "class-12": "object-class-12",
    "class-13": "object-class-13",
    "class-14": "object-class-14",
    "class-15": "object-class-15",
    "class-16": "object-class-16",
    "class-17": "object-class-17",
    "class-18": "object-class-18",
    "masculine": "object-masculine",
    "feminine": "object-feminine",
}
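# Illustrative (assumed) effect of the mapping above: under
# "dummy-object-concord", a concord cell that would otherwise yield the tags
# ("second-person", "singular") yields
# ("object-second-person", "object-singular") instead.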
# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)
# Words in title that cause addition of tags to the table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican and Lombard regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
    "contracted": "contracted",
    "present": "present",
    "perfect": "perfect",
    "imperfect": "imperfect",
    "pluperfect": "pluperfect",
    "future": "future",
    "aorist": "aorist",
}
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)

# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "iterative": "iterative",
    "poetic": "poetic",
    "Attic": "Attic",
    "Epic": "Epic",
    "Aeolic": "Aeolic",
    "Arcadocypriot": "Arcadocypriot",
    "Old Attic": "Old-Attic",
    "Boeotian": "Boeotian",
    "Byzantine": "Byzantine",
    "Choral Doric": "Choral-Doric",
    "Doric": "Doric",
    "Elean": "Elean",
    "Epirote": "Epirote",
    "Ionic": "Ionic",
    "Koine": "Koine",
    "Cretan": "Cretan",
    "Corinthian": "Corinthian",
    "Laconian": "Laconian",
    "Later poetic": "Later-poetic-Ancient-Greek",
    "Lesbian": "Lesbian",
    "Locrian": "Locrian",
    "Lyric": "Lyric-Ancient-Greek",
    "Thessalian": "Thessalian",
    "Tragic": "Tragic-Ancient-Greek",
}
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Parenthesized element starts that are mapped to tags; the rest of
# the element becomes the form
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)
# Regexp for cell starts that are likely definitions of reference symbols.
# See also nondef_re.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"
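# Illustrative (assumed) behavior of the two patterns above:
#   def_re.match("* only in the plural")   # matches: "*" starts a ref def
#   def_re.match("¹ archaic")              # matches: superscript ref def
#   nondef_re.match("1 sg")                # matches: person/number, not a def
#   nondef_re.match("15 / 17")             # matches: noun-class pair, not a def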
class InflCell:
    """Cell in an inflection table."""

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        self.text = text.strip()
        self.is_title = text and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        v = "{}/{}/{}/{!r}".format(
            self.text, self.is_title, self.colspan, self.rowspan
        )
        if self.target:
            v += ": {!r}".format(self.target)
        return v

    def __repr__(self) -> str:
        return str(self)
class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: TagSets,
        text: str,
        all_headers_row: bool,
    ) -> None:
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        for x in tagsets:
            assert isinstance(x, tuple)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        self.tagsets = list(tuple(sorted(set(tags))) for tags in tagsets)
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False
def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        name = unicodedata.name(ch)
    except ValueError:
        return False
    return (
        re.match(
            r"SUPERSCRIPT |"
            r"MODIFIER LETTER SMALL |"
            r"MODIFIER LETTER CAPITAL ",
            name,
        )
        is not None
    )
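# Quick illustration (values per Unicode character names):
#   is_superscript("²")  # True  ("SUPERSCRIPT TWO")
#   is_superscript("ᵃ")  # True  ("MODIFIER LETTER SMALL A")
#   is_superscript("a")  # False ("LATIN SMALL LETTER A")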
def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when together they serve
    no purpose (i.e., when they cover all of the language's options)."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    if (
        "animate" in tags
        and "inanimate" in tags
        and get_lang_conf(lang, "animate_inanimate_remove")
    ):
        tags.remove("animate")
        tags.remove("inanimate")
    if (
        "virile" in tags
        and "nonvirile" in tags
        and get_lang_conf(lang, "virile_nonvirile_remove")
    ):
        tags.remove("virile")
        tags.remove("nonvirile")
    # If all numbers in the language are listed, remove them all
    numbers = get_lang_conf(lang, "numbers")
    if numbers and all(x in tags for x in numbers):
        for x in numbers:
            tags.remove(x)
    # If all genders in the language are listed, remove them all
    genders = get_lang_conf(lang, "genders")
    if genders and all(x in tags for x in genders):
        for x in genders:
            tags.remove(x)
    # If all voices in the language are listed, remove them all
    voices = get_lang_conf(lang, "voices")
    if voices and all(x in tags for x in voices):
        for x in voices:
            tags.remove(x)
    # If all strengths of the language are listed, remove them all
    strengths = get_lang_conf(lang, "strengths")
    if strengths and all(x in tags for x in strengths):
        for x in strengths:
            tags.remove(x)
    # If all persons of the language are listed, remove them all
    persons = get_lang_conf(lang, "persons")
    if persons and all(x in tags for x in persons):
        for x in persons:
            tags.remove(x)
    # If all definitenesses of the language are listed, remove them all
    definitenesses = get_lang_conf(lang, "definitenesses")
    if definitenesses and all(x in tags for x in definitenesses):
        for x in definitenesses:
            tags.remove(x)
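# Minimal sketch (assumed language config): if get_lang_conf(lang, "numbers")
# returns ["singular", "plural"] and both tags are present, both are removed,
# since together they say nothing about the form:
#   tags = {"singular", "plural", "nominative"}
#   remove_useless_tags(lang, pos, tags)   # tags -> {"nominative"}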
def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    return set(valid_tags[t] for ts in tagset for t in ts)
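# For example (the tag -> category mapping comes from valid_tags, so the
# exact categories here are illustrative):
#   tagset_cats([("masculine", "singular"), ("plural",)])
#   # -> {"gender", "number"}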
def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split into
    more alternatives. The tagsets are assumed to be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Try to merge tags1 into one of the existing tagsets; if that is
        # not possible, append it as a new alternative.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            # print("tags1={} tags2={} num_differ={}"
            #       .format(tags1, tags2, num_differ))
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        tagsets.append(())

    # print("or_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, tagsets))
    return tagsets
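# A minimal sketch of the merge behavior (tag names assumed valid; actual
# results also depend on remove_useless_tags for the given language):
#   or_tagsets(lang, pos, [("dative", "singular")], [("accusative", "singular")])
#   # -> [("accusative", "dative", "singular")]   only "case" differs: merged
#   or_tagsets(lang, pos, [("dative", "singular")], [("accusative", "plural")])
#   # -> [("dative", "singular"), ("accusative", "plural")]
#   #    two categories differ: kept as separate alternatives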
def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking the union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    # print("and_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, new_tagsets))
    return new_tagsets
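# Sketch: and_tagsets takes the cross-product union of the alternatives
# (again assuming these tags exist in valid_tags):
#   and_tagsets(lang, pos, [("first-person",), ("second-person",)], [("singular",)])
#   # -> [("first-person", "singular"), ("second-person", "singular")]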
@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags)."""
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags = []
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # (The misspellings "non-existant" and "instrumenal" below are
    # intentional: they match the text as it appears on Wiktionary.)
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    #       .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags
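# Rough sketch of the return shapes (cell texts invented):
#   extract_cell_content(lang, word, "genitive¹")
#   # -> ("genitive", ["1"], [], [])            trailing superscript -> ref
#   extract_cell_content(lang, word, "1) rarely used")
#   # -> ("", [], [("1", "rarely used")], [])   a footnote definition cell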
@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses an inflection table title. This returns (global_tags,
    table_tags, extra_forms), where ``global_tags`` is tags to be added to
    each inflection entry, ``table_tags`` are tags for the word but not to be
    added to every form, and ``extra_forms`` is a list of dictionaries
    describing additional forms to be included in the part-of-speech
    entry."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags: list[str] = []
    table_tags: list[str] = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of the title (e.g., Armenian) and
    # various other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized parts of the title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) the first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contain no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms
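# Sketch (title invented; exact tag lists depend on the maps above):
#   parse_title("Declension of foo (archaic)", "declension table")
#   # global_tags ~ []                no title_contains_global_map word present
#   # table_tags  ~ ["archaic", ...]  "archaic" hits both the wordtags map and
#   #                                 the parenthesized-elements map, so it may
#   #                                 appear more than once
#   # extra_forms ~ []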
def expand_header(
    wxr: WiktextractContext,
    tablecontext: "TableContext",
    word: str,
    lang: str,
    pos: str,
    text: str,
    base_tags: Union[list[str], set[str], tuple[str, ...]],
    silent=False,
    ignore_tags=False,
    depth=0,
    column_number: int | None = None,
) -> list[tuple[str, ...]]:
    """Expands a cell header to a tagset, handling conditional expressions
    in infl_map. This returns a list of tuples of tags, each list element
    describing an alternative interpretation. ``base_tags`` is the combined
    column and row tags for the cell in which the text is being interpreted
    (conditional expressions in inflection data may depend on them).
    If ``silent`` is True, then no warnings will be printed. If ``ignore_tags``
    is True, then tags listed in "if" will be ignored in the test (this is
    used when trying to heuristically detect whether a non-<th> cell is
    nevertheless a header)."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(text, str)
    assert isinstance(base_tags, (list, tuple, set))
    assert silent in (True, False)
    assert isinstance(depth, int)
    # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags))
    # First map the text using the inflection map
    text = clean_value(wxr, text)
    combined_return: list[tuple[str, ...]] = []
    parts = split_at_comma_semi(text, separators=[";"])
    for text in parts:
        if not text:
            continue
        if text in infl_map:
            v = infl_map[text]  # list or string
        else:
            m = re.match(infl_start_re, text)
            if m is not None:
                v = infl_start_map[m.group(1)]
                # print("INFL_START {} -> {}".format(text, v))
            elif re.match(r"Notes", text):
                # Ignored header
                # print("IGNORING NOTES")
                combined_return = or_tagsets(
                    lang, pos, combined_return, [("dummy-skip-this",)]
                )
                # this just adds dummy-skip-this
                continue
            elif text in IGNORED_COLVALUES:
                combined_return = or_tagsets(
                    lang, pos, combined_return, [("dummy-ignore-skipped",)]
                )
                continue
            # Try without the final parenthesized part
            text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text)
            if text_without_parens in infl_map:
                v = infl_map[text_without_parens]
            elif m is None:
                if not silent:
                    wxr.wtp.debug(
                        "inflection table: unrecognized header: {}".format(
                            repr(text)
                        ),
                        sortid="inflection/735",
                    )
                # Unrecognized header
                combined_return = or_tagsets(
                    lang, pos, combined_return, [("error-unrecognized-form",)]
                )
                continue

        # Then loop interpreting the value, until the value is a simple string.
        # This may evaluate nested conditional expressions.
        default_else = None
        while True:
            # If it is a string, we are done.
            if isinstance(v, str):
                tags = set(v.split())
                remove_useless_tags(lang, pos, tags)
                tagset = [tuple(sorted(tags))]
                break
            # For a list, just interpret it as alternatives. (Currently the
            # alternatives must directly be strings.)
            if isinstance(v, (list, tuple)):
                tagset = []
                for x in v:
                    tags = set(x.split())
                    remove_useless_tags(lang, pos, tags)
                    tags_t = tuple(sorted(tags))
                    if tags_t not in tagset:
                        tagset.append(tags_t)
                break
            # Otherwise the value should be a dictionary describing a
            # conditional expression.
            if not isinstance(v, dict):
                wxr.wtp.debug(
                    "inflection table: internal: "
                    "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]),
                    sortid="inflection/767",
                )
                tagset = [()]
                break
            # Evaluate the conditional expression.
            assert isinstance(v, dict)
            cond: Union[bool, str] = "default-true"
            c: Union[str, list[str], set[str]] = ""
            # Handle the "lang" condition. The value must be either a
            # single language or a list of languages, and the
            # condition evaluates to True if the table is in one of
            # those languages.
            if "lang" in v:
                c = v["lang"]
                # check if it's a code and transform if necessary
                if isinstance(c, str):
                    if c != lang:
                        cond = lang == code_to_name(c, "en")
                    else:
                        cond = True
                else:
                    assert isinstance(c, (list, tuple, set))
                    if lang not in c:
                        cond = name_to_code(lang, "en") in c
                    else:
                        cond = True
            # Handle the "nested-table-depth" condition. The value must
            # be an int or a list of ints, and the condition evaluates to
            # True if the depth is one of those values.
            # "depth" is how deep into a nested table tree the current
            # table lies. It is first started in handle_wikitext_table,
            # so it only applies to tables-within-tables, not other
            # WikiNode content. `depth` is currently only passed as a
            # parameter down the table parsing stack, and not stored.
            if cond and "nested-table-depth" in v:
                d = v["nested-table-depth"]
                if isinstance(d, int):
                    cond = d == depth
                else:
                    assert isinstance(d, (list, tuple, set))
                    cond = depth in d
            # Column index: check if we're in position X of the row
            if cond and "column-index" in v:
                index = v["column-index"]
                if isinstance(index, int):
                    cond = index == column_number
                else:
                    assert isinstance(index, (list, tuple, set))
                    cond = column_number in index
            # Handle the "inflection-template" condition. The value must be
            # a string or a list of strings, and the condition is accepted
            # if tablecontext.template_name is among them.
            # TableContext.template_name is passed down from page/
            # parse_inflection, before parsing and expanding itself
            # has begun.
            if cond and tablecontext and "inflection-template" in v:
                d1 = v["inflection-template"]
                if isinstance(d1, str):
                    cond = d1 == tablecontext.template_name
                else:
                    assert isinstance(d1, (list, tuple, set))
                    cond = tablecontext.template_name in d1
            # Handle the "pos" condition. The value must be either a single
            # part-of-speech or a list of them, and the condition evaluates to
            # True if the part-of-speech is any of those listed.
            if cond and "pos" in v:
                c = v["pos"]
                if isinstance(c, str):
                    cond = c == pos
                else:
                    assert isinstance(c, (list, tuple, set))
                    cond = pos in c
            # Handle the "if" condition. The value must be a string containing
            # a space-separated list of tags. The condition evaluates to True
            # if ``base_tags`` contains all of the listed tags. If the
            # condition is of the form "any: ...tags...", then any of the tags
            # will be enough.
            if cond and "if" in v and not ignore_tags:
                c = v["if"]
                assert isinstance(c, str)
                # The "if" condition is true if any of the listed tags is
                # present when it starts with "any:"; otherwise all must be
                # present
                if c.startswith("any: "):
                    cond = any(t in base_tags for t in c[5:].split())
                else:
                    cond = all(t in base_tags for t in c.split())

            # Handle "default" assignment. Store the value to be used
            # as a default later.
            if "default" in v:
                assert isinstance(v["default"], str)
                default_else = v["default"]

            # Warning message about missing conditions, for debugging.
            if cond == "default-true" and not default_else and not silent:
                wxr.wtp.debug(
                    "inflection table: IF MISSING COND: word={} "
                    "lang={} text={} base_tags={} c={} cond={}".format(
                        word, lang, text, base_tags, c, cond
                    ),
                    sortid="inflection/851",
                )
            # Based on the result of evaluating the condition, select either
            # the "then" part or the "else" part.
            if cond:
                v = v.get("then", "")
            else:
                v1 = v.get("else")
                if v1 is None:
                    if default_else is not None:
                        v = default_else
                    else:
                        if not silent:
                            wxr.wtp.debug(
                                "inflection table: IF WITHOUT ELSE EVALS "
                                "False: "
                                "{}/{} {!r} base_tags={}".format(
                                    word, lang, text, base_tags
                                ),
                                sortid="inflection/865",
                            )
                        v = "error-unrecognized-form"
                else:
                    v = v1

        # Merge the resulting tagset from this header part with the other
        # tagsets from the whole header
        combined_return = or_tagsets(lang, pos, combined_return, tagset)

    # Return the combined tagsets, or an empty tagset if we got no tagsets
    if not combined_return:
        combined_return = [()]
    return combined_return
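# Sketch of an infl_map conditional entry that this function evaluates
# (shape only; the key and values here are invented):
#   infl_map["singular"] = {
#       "lang": ["Estonian", "Finnish"],   # condition: table language
#       "if": "possessive",                # condition: tag in base_tags
#       "then": "singular possessive",
#       "else": "singular",
#   }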
def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list[HdrSpan],
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    #       .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                "  row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    used = set()
    coltags = [()]
    last_header_row = 1000000
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000
    used_hdrspans = set()
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets. This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row. Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside

                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above. However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 1000000 above
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately precedes the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags
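# Sketch of what this computes (positions and tags invented): given a header
# span "plural" over columns 0-3 on row 0 and "genitive" over column 2 on
# row 1, a form cell at column 2 would get roughly
#   compute_coltags(lang, pos, hdrspans, 2, 1, "...")
#   # -> [("genitive", "plural")]
# i.e., headers above the cell are combined row by row with and_tagsets,
# while alternatives within a row are merged with or_tagsets.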
def parse_simple_table(
    wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
):
    """This is the default table parser. Despite its name, it can parse
    complex tables. This returns a list of forms to be added to the
    part-of-speech, or None if the table could not be parsed."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(tablecontext, TableContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        for col in row:
            assert isinstance(col, InflCell)
    assert isinstance(titles, list)
    for x in titles:
        assert isinstance(x, str)

    # print("PARSE_SIMPLE_TABLE: TITLES:", titles)
    if debug_cell_text:
        print("ROWS:")
        for row in rows:
            print("  ", row)

    # Check for forced rowspan kludge. See e.g.
    # maorski/Serbo-Croatian. These are essentially multi-row
    # cells implemented using <br> rather than separate cells. We fix this
    # by identifying rows where this happens, and splitting the current row
    # into multiple rows by synthesizing additional cells.
    new_rows = []
    for row in rows:
        split_row = (
            any(x.is_title and x.text in ("inanimate\nanimate",) for x in row)
            and
            # x is an InflCell
            all(x.rowspan == 1 for x in row)
        )
        if not split_row:
            new_rows.append(row)
            continue
        row1 = []
        row2 = []
        for cell in row:
            cell1 = copy.deepcopy(cell)
            if "\n" in cell.text:
                # Has more than one line - split this cell
                parts = cell.text.strip().splitlines()
                if len(parts) != 2:
                    wxr.wtp.debug(
                        "forced rowspan kludge got {} parts: {!r}".format(
                            len(parts), cell.text
                        ),
                        sortid="inflection/1234",
                    )
                cell2 = copy.deepcopy(cell)
                cell1.text = parts[0]
                cell2.text = parts[1]
            else:
                cell1.rowspan = 2
                cell2 = cell1  # ref, not a copy
            row1.append(cell1)
            row2.append(cell2)
        new_rows.append(row1)
        new_rows.append(row2)
    rows = new_rows
    # print("ROWS AFTER FORCED ROWSPAN KLUDGE:")
    # for row in rows:
    #     print("  ", row)

    # Parse definitions for references (from table itself and from text
    # after it)
    def_ht = {}

    def add_defs(defs: list[tuple[str, str]]) -> None:
        for ref, d in defs:
            # print("DEF: ref={} d={}".format(ref, d))
            d = d.strip()
            d = d.split(". ")[0].strip()  # text before ". "
            if not d:
                continue
            if d.endswith("."):  # catch ".."??
                d = d[:-1]
            tags, topics = decode_tags(d, no_unknown_starts=True)
            # print(f"{ref=}, {transformed=}, {tags=}")
            if topics or any("error-unknown-tag" in ts for ts in tags):
                d = d[0].lower() + d[1:]
                tags, topics = decode_tags(d, no_unknown_starts=True)
                if topics or any("error-unknown-tag" in ts for ts in tags):
                    # Failed to parse as tags
                    # print("Failed: topics={} tags={}"
                    #       .format(topics, tags))
                    continue
            tags1_s: set[str] = set()
            for ts in tags:
                # set.update is a union operation: definition tags are flat
                tags1_s.update(ts)
            tags1 = tuple(sorted(tags1_s))
            # print("DEFINED: {} -> {}".format(ref, tags1))
            def_ht[ref] = tags1
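    # Illustrative (invented) outcome: if text around the table defines
    # "* dialectal", add_defs() would store roughly
    #     def_ht["*"] == ("dialectal",)
    # so that forms whose cells carried a "*" reference later get that tag.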
    def generate_tags(
        rowtags: list[tuple[str]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        new_coltags = []
        all_hdr_tags = []  # list of tuples
        new_rowtags = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                base_tags: set[str] = (
                    set(rt0)
                    | set(ct0)
                    | set(global_tags)
                    | set(table_tags)
                )  # Union.
                # print(f"{rt0=}, {ct0=}, {global_tags=},"
                #       f" {table_tags=}, {base_tags=}")
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                    column_number=col_idx,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags
1575 def add_new_hdrspan(
1576 col: str,
1577 hdrspans: list[HdrSpan],
1578 store_new_hdrspan: bool,
1579 col0_followed_by_nonempty: bool,
1580 col0_hdrspan: Optional[HdrSpan],
1581 ) -> tuple[str, bool, Optional[HdrSpan]]:
1582 hdrspan = HdrSpan(
1583 col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
1584 )
1585 hdrspans.append(hdrspan)
1587 # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
1588 # to be added to a register of stored hdrspans to be used
1589 # later with "dummy-load-stored-hdrspans".
1590 if store_new_hdrspan: 1590 ↛ 1591: line 1590 didn't jump to line 1591 because the condition on line 1590 was never true
1591 tablecontext.stored_hdrspans.append(hdrspan)
1593 # Handle headers that are above left-side header
1594 # columns and are followed by personal pronouns in
1595 # remaining columns (basically headers that
1596 # evaluate to no tags). In such cases widen the
1597 # left-side header to the full row.
1598 if previously_seen: # id(cell) in seen_cells previously
1599 col0_followed_by_nonempty = True
1600 return col, col0_followed_by_nonempty, col0_hdrspan
1601 elif col0_hdrspan is None:
1602 col0_hdrspan = hdrspan
1603 elif any(all_hdr_tags): 1603 ↛ 1671: line 1603 didn't jump to line 1671 because the condition on line 1603 was always true
1604 col0_cats = tagset_cats(col0_hdrspan.tagsets)
1605 later_cats = tagset_cats(all_hdr_tags)
1606 col0_allowed = get_lang_conf(lang, "hdr_expand_first")
1607 later_allowed = get_lang_conf(lang, "hdr_expand_cont")
1608 later_allowed = later_allowed | set(["dummy"])
1609 # dummy2 has different behavior than plain dummy
1610 # and does not belong here.
1612 # print("col0_cats={} later_cats={} "
1613 # "fol_by_nonempty={} col_idx={} end={} "
1614 # "tagsets={}"
1615 # .format(col0_cats, later_cats,
1616 # col0_followed_by_nonempty, col_idx,
1617 # col0_hdrspan.start +
1618 # col0_hdrspan.colspan,
1619 # col0_hdrspan.tagsets))
1620 # print("col0.rowspan={} rowspan={}"
1621 # .format(col0_hdrspan.rowspan, rowspan))
1622 # Only expand if [col0_cats and later_cats are allowed
1623 # and don't overlap] and [col0 has tags], and there have
1624 # been [no disallowed cells in between].
1625 #
1626 # There are three cases here:
1627 # - col0_hdrspan set, continue with allowed current
1628 # - col0_hdrspan set, expand, start new
1629 # - col0_hdrspan set, no expand, start new
1630 if (
1631 not col0_followed_by_nonempty
1632 and
1633 # XXX Only one cat of tags: kunna/Swedish
1634 # XXX len(col0_cats) == 1 and
1635 col0_hdrspan.rowspan >= rowspan
1636 and
1637 # from hdrspan
1638 not (later_cats - later_allowed)
1639 and not (col0_cats & later_cats)
1640 ):
1641 # First case: col0 set, continue
1642 return col, col0_followed_by_nonempty, col0_hdrspan
1643 # We are going to start new col0_hdrspan. Check if
1644 # we should expand.
1645 if (
1646 not col0_followed_by_nonempty
1647 and not (col0_cats - col0_allowed)
1648 and
1649 # Only "allowed" allowed
1650 # XXX len(col0_cats) == 1 and
1651 col_idx > col0_hdrspan.start + col0_hdrspan.colspan
1652 ):
1653 # col_idx is beyond current colspan
1654 # *Expand* current col0_hdrspan
1655 # print("EXPANDING COL0 MID: {} from {} to {} "
1656 # "cols {}"
1657 # .format(col0_hdrspan.text,
1658 # col0_hdrspan.colspan,
1659 # col_idx - col0_hdrspan.start,
1660 # col0_hdrspan.tagsets))
1661 col0_hdrspan.colspan = col_idx - col0_hdrspan.start
1662 col0_hdrspan.expanded = True
1663 # Clear old col0_hdrspan
1664 if col == debug_cell_text: 1664 ↛ 1665: line 1664 didn't jump to line 1665 because the condition on line 1664 was never true
1665 print("START NEW {}".format(hdrspan.tagsets))
1666 col0_hdrspan = None
1667 # Now start new, unless it comes from previous row
1668 if not previously_seen: 1668 ↛ 1671: line 1668 didn't jump to line 1671 because the condition on line 1668 was always true
1669 col0_hdrspan = hdrspan
1670 col0_followed_by_nonempty = False
1671 return col, col0_followed_by_nonempty, col0_hdrspan
1673 def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
1674 # Split the cell text into alternatives
1675 split_extra_tags = []
1676 if col and is_superscript(col[0]): 1676 ↛ 1677: line 1676 didn't jump to line 1677 because the condition on line 1676 was never true
1677 alts = [col]
1678 else:
1679 separators = [";", "•", r"\n", " or "]
1680 if " + " not in col:
1681 separators.append(",")
1682 if not col.endswith("/"):
1683 separators.append("/")
1684 if col in special_phrase_splits:
1685 # Use language-specific special splits.
1686 # These are phrases and constructions that have
1687 # unique ways of splitting, not specific characters
1688 # to split on like with the default splitting.
1689 alts, tags = special_phrase_splits[col]
1690 split_extra_tags = tags.split()
1691 for x in split_extra_tags:
1692 assert x in valid_tags
1693 assert isinstance(alts, (list, tuple))
1694 assert isinstance(tags, str)
1695 else:
1696 # Use default splitting. However, recognize
1697 # language-specific replacements and change them to magic
1698 # characters before splitting. This way we won't split
1699 # them. This is important for, e.g., recognizing
1700 # alternative pronouns.
1701 # The magic characters are characters outside the normal Unicode
1702 # range, each assigned a simple incremental value (int above Unicode).
1703 repls = {}
1704 magic_ch = MAGIC_FIRST
1705 trs = get_lang_conf(lang, "form_transformations")
1706 # trs is a list of lists of strings
1707 for _, v, _, _ in trs:
1708 # v is a pattern string, like "^ich"
1709 # form_transformations data is doing double-duty here,
1710 # because the pattern strings are already known to us and
1711 # not meant to be split.
1712 m = re.search(v, col)
1713 if m is not None:
1714 # if pattern found in text
1715 magic = chr(magic_ch)
1716 magic_ch += 1 # next magic character value
1717 col = re.sub(v, magic, col) # replace with magic ch
1718 repls[magic] = m.group(0)
1719 # remember what regex match string each magic char
1720 # replaces. .group(0) is the whole match.
1721 alts0 = split_at_comma_semi(col, separators=separators)
1722 # with magic characters in place, split the text so that
1723 # pre-transformation text is out of the way.
1724 alts = []
1725 for alt in alts0:
1726 # create a new list with the separated items and
1727 # the magic characters replaced with the original texts.
1728 for k, v in repls.items():
1729 alt = re.sub(k, v, alt)
1730 alts.append(alt)
1732 # Remove "*" from beginning of forms, as in non-attested
1733 # or reconstructed forms. Otherwise it might confuse romanization
1734 # detection.
1735 alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts)
1736 alts = list(
1737 x for x in alts if not re.match(r"pronounced with |\(with ", x)
1738 )
1739 alts = list(
1740 re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts
1741 )
1742 return col, alts, split_extra_tags
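# Editorial sketch (not part of the source) of the magic-character trick
# above, with a hypothetical Russian pronoun pair that must survive the
# "/" separator intact:
#   >>> import re
#   >>> col, magic = "он/она читает", chr(MAGIC_FIRST)
#   >>> col = re.sub(r"он/она", magic, col)  # protect the pronoun pair
#   >>> parts = col.split("/")               # "/" is a default separator
#   >>> [p.replace(magic, "он/она") for p in parts]
#   ['он/она читает']
# Without the protection, the split would wrongly yield two halves.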
1744 def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
1745 # Handle the special case where romanization is given under
1746 # normal form, e.g. in Russian. There can be multiple
1747 # comma-separated forms in each case. We also handle the case
1748 # where instead of romanization we have IPA pronunciation
1749 # (e.g., avoir/French/verb).
1750 len2 = len(alts) // 2
1751 # Check for IPAs (forms first, IPAs under)
1752 # base, base, IPA, IPA
1753 if (
1754 len(alts) % 2 == 0 # Divisible by two
1755 and all(
1756 re.match(r"^\s*/.*/\s*$", x) # Inside slashes = IPA
1757 for x in alts[len2:]
1758 )
1759 ): # In the second half of alts
1760 nalts = list(
1761 (alts[i], "", alts[i + len2])
1762 # List of tuples: (base, "", ipa)
1763 for i in range(len2)
1764 )
1765 # base, base, base, IPA
1766 elif (
1767 len(alts) > 2
1768 and re.match(r"^\s*/.*/\s*$", alts[-1])
1769 and all(not x.startswith("/") for x in alts[:-1])
1770 ):
1771 # Only if the last alt is IPA
1772 nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
1773 # base, IPA, IPA, IPA
1774 elif (
1775 len(alts) > 2
1776 and not alts[0].startswith("/")
1777 and all(
1778 re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
1779 )
1780 ):
1781 # First is base and the rest is IPA alternatives
1782 nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))
1784 # Check for romanizations, forms first, romanizations under
1785 elif (
1786 len(alts) % 2 == 0
1787 and not any("(" in x for x in alts)
1788 and all(
1789 classify_desc(
1790 re.sub(
1791 r"\^.*$",
1792 "",
1793 # Remove ends of strings starting from ^.
1794 # Superscripts have already been removed
1795 # from the string, while ^xyz needs to be
1796 # removed separately, though it's usually
1797 # something with a single letter?
1798 "".join(xx for xx in x if not is_superscript(xx)),
1799 )
1800 )
1801 == "other"
1802 for x in alts[:len2]
1803 )
1804 and all(
1805 classify_desc(
1806 re.sub(
1807 r"\^.*$",
1808 "",
1809 "".join(xx for xx in x if not is_superscript(xx)),
1810 )
1811 )
1812 in ("romanization", "english")
1813 for x in alts[len2:]
1814 )
1815 ):
1816 nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
1817 # Check for romanizations, forms and romanizations alternating
1818 elif (
1819 len(alts) % 2 == 0
1820 and not any("(" in x for x in alts)
1821 and all(
1822 classify_desc(
1823 re.sub(
1824 r"\^.*$",
1825 "",
1826 "".join(xx for xx in alts[i] if not is_superscript(xx)),
1827 )
1828 )
1829 == "other"
1830 for i in range(0, len(alts), 2)
1831 )
1832 and all(
1833 classify_desc(
1834 re.sub(
1835 r"\^.*$",
1836 "",
1837 "".join(xx for xx in alts[i] if not is_superscript(xx)),
1838 )
1839 )
1840 in ("romanization", "english")
1841 for i in range(1, len(alts), 2)
1842 )
1843 ):
1844 # odds
1845 nalts = list(
1846 (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
1847 )
1848 # evens
1849 # Handle complex Georgian entries with alternative forms and
1850 # romanizations. It's a bit of a mess. Remove this kludge if not
1851 # needed anymore. NOTE THAT THE PARENTHESES ON THE WEBSITE ARE NOT
1852 # DISPLAYED. They are put inside their own span elements that are
1853 # then hidden with some CSS.
1854 # https://en.wiktionary.org/wiki/%E1%83%90%E1%83%9B%E1%83%94%E1%83%A0%E1%83%98%E1%83%99%E1%83%98%E1%83%A1_%E1%83%A8%E1%83%94%E1%83%94%E1%83%A0%E1%83%97%E1%83%94%E1%83%91%E1%83%A3%E1%83%9A%E1%83%98_%E1%83%A8%E1%83%A2%E1%83%90%E1%83%A2%E1%83%94%E1%83%91%E1%83%98
1855 # ამერიკის შეერთებულ შტატებს(ა) (ameriḳis šeertebul šṭaṭebs(a))
1856 # The above should generate two alts entries, with two different
1857 # parallel versions, one without (a) and with (a) at the end,
1858 # for both the Georgian original and the romanization.
1859 elif ( 1859 ↛ 1864: line 1859 didn't jump to line 1864 because the condition on line 1859 was never true
1860 tablecontext.template_name == "ka-decl-noun"
1861 and len(alts) >= 1
1862 and any(" (" in alt_ for alt_ in alts)
1863 ):
1864 nalts = ka_decl_noun_template_cell(alts)
1865 else:
1866 new_alts = []
1867 for alt in alts:
1868 lst = [""]
1869 idx = 0
1870 for m in re.finditer(
1871 r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
1872 # start OR letter OR asterisk (word/word*)
1873 # \\___________group 1_______/ \ \_g3_///
1874 # \ \__gr. 2_//
1875 # \_____________group 0________________/
1876 alt,
1877 ):
1878 v = m.group(2) # (word/word/word...)
1879 if (
1880 classify_desc(v) == "tags" # Tags inside parens
1881 or m.group(0) == alt
1882 ): # All in parens
1883 continue
1884 new_lst = []
1885 for x in lst:
1886 x += alt[idx : m.start()] + m.group(1)
1887 # alt until letter or asterisk
1888 idx = m.end()
1889 vparts = v.split("/")
1890 # group(2) = ["word", "wörd"...]
1891 if len(vparts) == 1:
1892 new_lst.append(x)
1893 new_lst.append(x + v)
1894 # "kind(er)" -> ["kind", "kinder"]
1895 else:
1896 for vv in vparts:
1897 new_lst.append(x + vv)
1898 # "lampai(tten/den)" ->
1899 # ["lampaitten", "lampaiden"]
1900 lst = new_lst
1901 for x in lst:
1902 new_alts.append(x + alt[idx:])
1903 # add the end of alt
1904 nalts = list((x, "", "") for x in new_alts)
1905 # [form, no romz, no ipa]
1906 return nalts
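# Editorial sketch (not part of the source): the (form, roman, ipa)
# triples that handle_mixed_lines() produces, with hypothetical input.
# Forms in the first half pair with romanizations in the second half:
#   alts = ["мать", "мати", "matʹ", "mati"]
#   -> [("мать", "matʹ", ""), ("мати", "mati", "")]
# IPA variants pair the same way but land in the third slot:
#   alts = ["avoir", "/a.vwaʁ/"]  ->  [("avoir", "", "/a.vwaʁ/")]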
1908 def find_semantic_parens(form: str) -> tuple[str, list[str]]:
1909 # "Some languages" (=Greek) use brackets to mark things that
1910 # require tags, like (informality), [rarity] and {archaicity}.
1911 extra_tags = []
1912 if re.match(r"\([^][(){}]*\)$", form):
1913 if get_lang_conf(lang, "parentheses_for_informal"):
1914 form = form[1:-1]
1915 extra_tags.append("informal")
1916 else:
1917 form = form[1:-1]
1918 elif re.match(r"\{\[[^][(){}]*\]\}$", form):
1919 if get_lang_conf( 1919 ↛ 1926: line 1919 didn't jump to line 1926 because the condition on line 1919 was always true
1920 lang, "square_brackets_for_rare"
1921 ) and get_lang_conf(lang, "curly_brackets_for_archaic"):
1922 # είμαι/Greek/Verb
1923 form = form[2:-2]
1924 extra_tags.extend(["rare", "archaic"])
1925 else:
1926 form = form[2:-2]
1927 elif re.match(r"\{[^][(){}]*\}$", form):
1928 if get_lang_conf(lang, "curly_brackets_for_archaic"): 1928 ↛ 1933: line 1928 didn't jump to line 1933 because the condition on line 1928 was always true
1929 # είμαι/Greek/Verb
1930 form = form[1:-1]
1931 extra_tags.extend(["archaic"])
1932 else:
1933 form = form[1:-1]
1934 elif re.match(r"\[[^][(){}]*\]$", form):
1935 if get_lang_conf(lang, "square_brackets_for_rare"): 1935 ↛ 1940: line 1935 didn't jump to line 1940 because the condition on line 1935 was always true
1936 # είμαι/Greek/Verb
1937 form = form[1:-1]
1938 extra_tags.append("rare")
1939 else:
1940 form = form[1:-1]
1941 return form, extra_tags
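# Editorial sketch (not part of the source): bracket semantics under a
# Greek-style language configuration, with a hypothetical form:
#   "(γράφω)"   -> ("γράφω", ["informal"])  # parentheses_for_informal
#   "[γράφω]"   -> ("γράφω", ["rare"])      # square_brackets_for_rare
#   "{γράφω}"   -> ("γράφω", ["archaic"])   # curly_brackets_for_archaic
#   "{[γράφω]}" -> ("γράφω", ["rare", "archaic"])
# Without the corresponding language flags, the brackets are stripped
# and no tags are added.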
1943 def handle_parens(
1944 form: str, roman: str, clitic: str, extra_tags: list[str]
1945 ) -> tuple[str, str, str]:
1946 if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
1947 # is there a clitic starting with apostrophe?
1948 clitic = paren
1949 # assume the whole paren is a clitic
1950 # then remove paren from form
1951 form = (form[: m.start()] + subst + form[m.end() :]).strip()
1952 elif classify_desc(paren) == "tags":
1953 tagsets1, topics1 = decode_tags(paren)
1954 if not topics1: 1954 ↛ 1975: line 1954 didn't jump to line 1975 because the condition on line 1954 was always true
1955 for ts in tagsets1:
1956 ts = tuple(x for x in ts if " " not in x)
1957 # There are some generated tags containing
1958 # spaces; do not let them through here.
1959 extra_tags.extend(ts)
1960 form = (form[: m.start()] + subst + form[m.end() :]).strip()
1961 # brackets contain romanization
1962 elif (
1963 m.start() > 0
1964 and not roman
1965 and classify_desc(form[: m.start()]) == "other"
1966 and
1967 # "other" ~ text
1968 classify_desc(paren) in ("romanization", "english")
1969 and not re.search(r"^with |-form$", paren)
1970 ):
1971 roman = paren
1972 form = (form[: m.start()] + subst + form[m.end() :]).strip()
1973 elif re.search(r"^with |-form", paren): 1973 ↛ 1974: line 1973 didn't jump to line 1974 because the condition on line 1973 was never true
1974 form = (form[: m.start()] + subst + form[m.end() :]).strip()
1975 return form, roman, clitic
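# Editorial sketch (not part of the source): how handle_parens() disposes
# of a parenthesized chunk, with hypothetical inputs. A paren matching
# the apostrophe pattern becomes a clitic; tag-like parens become tags;
# a trailing paren after native text may be read as a romanization:
#   "bhfuil ('na)"  -> form "bhfuil", clitic "'na"
#   "bhfuil (rare)" -> form "bhfuil", extra_tags + ["rare"]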
1977 def merge_row_and_column_tags(form, some_has_covered_text):
1978 # Merge column tags and row tags. We give preference
1979 # to moods etc coming from rowtags (cf. aussteigen/German/Verb
1980 # imperative forms).
1982 # In certain cases, what a tag means depends on whether
1983 # it is a row or column header. Depending on the language,
1984 # we replace certain tags with others if they're in
1985 # a column or row.
1987 ret = []
1988 # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
1989 # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
1990 for rt in sorted(rowtags):
1991 if "dummy-use-as-coltags" in rt: 1991 ↛ 1992line 1991 didn't jump to line 1992 because the condition on line 1991 was never true
1992 continue
1993 # if lang was in rowtag_replacements)
1994 # if not rtagreplacs == None:
1995 # rt = replace_directional_tags(rt, rtagreplacs)
1996 for ct in sorted(coltags):
1997 if "dummy-use-as-rowtags" in ct: 1997 ↛ 1998line 1997 didn't jump to line 1998 because the condition on line 1997 was never true
1998 continue
1999 # if lang was in coltag_replacements
2000 # if not ctagreplacs == None:
2001 # ct = replace_directional_tags(ct,
2002 # ctagreplacs)
2003 tags = set(global_tags)
2004 tags.update(extra_tags)
2005 tags.update(rt)
2006 tags.update(refs_tags)
2007 tags.update(tablecontext.section_header)
2008 # Merge tags from column. For certain kinds of tags,
2009 # those coming from row take precedence.
2010 old_tags = set(tags)
2011 for t in ct:
2012 c = valid_tags[t]
2013 if c in ("mood", "case", "number") and any(
2014 valid_tags[tt] == c for tt in old_tags
2015 ):
2016 continue
2017 tags.add(t)
2019 # Extract language-specific tags from the
2020 # form. This may also adjust the form.
2021 form, lang_tags = lang_specific_tags(lang, pos, form)
2022 tags.update(lang_tags)
2024 # For non-finite verb forms, see if they have
2025 # a gender/class suffix
2026 if pos == "verb" and any(
2027 valid_tags[t] == "non-finite" for t in tags
2028 ):
2029 form, tt = parse_head_final_tags(wxr, lang, form)
2030 tags.update(tt)
2032 # Remove "personal" tag if have nth person; these
2033 # come up with e.g. reconhecer/Portuguese/Verb. But
2034 # not if we also have "pronoun"
2035 if (
2036 "personal" in tags
2037 and "pronoun" not in tags
2038 and any(
2039 x in tags
2040 for x in [
2041 "first-person",
2042 "second-person",
2043 "third-person",
2044 ]
2045 )
2046 ):
2047 tags.remove("personal")
2049 # If we have impersonal, remove person and number.
2050 # This happens with e.g. viajar/Portuguese/Verb
2051 if "impersonal" in tags:
2052 tags = tags - set(
2053 [
2054 "first-person",
2055 "second-person",
2056 "third-person",
2057 "singular",
2058 "plural",
2059 ]
2060 )
2062 # Remove unnecessary "positive" tag from verb forms
2063 if pos == "verb" and "positive" in tags:
2064 if "negative" in tags: 2064 ↛ 2065line 2064 didn't jump to line 2065 because the condition on line 2064 was never true
2065 tags.remove("negative")
2066 tags.remove("positive")
2068 # Many Russian (and other Slavic) inflection tables
2069 # have animate/inanimate distinction that generates
2070 # separate entries for neuter/feminine, but the
2071 # distinction only applies to masculine. Remove them
2072 # from neuter/feminine and eliminate duplicates.
2073 if get_lang_conf(lang, "masc_only_animate"):
2074 for t1 in ("animate", "inanimate"):
2075 for t2 in ("neuter", "feminine"):
2076 if (
2077 t1 in tags
2078 and t2 in tags
2079 and "masculine" not in tags
2080 and "plural" not in tags
2081 ):
2082 tags.remove(t1)
2084 # German adjective tables contain "(keiner)" etc
2085 # for mixed declension plural. When the adjective
2086 # disappears and it becomes just one word, remove
2087 # the "includes-article" tag. e.g. eiskalt/German
2088 if "includes-article" in tags and " " not in form:
2089 tags.remove("includes-article")
2091 # Handle ignored forms. We mark that the form was
2092 # provided. This is important information; some words
2093 # just do not have a certain form. However, there are also
2094 # many cases where no word in a language has a
2095 # particular form. Post-processing could detect and
2096 # remove such cases.
2097 if form in IGNORED_COLVALUES:
2098 # if cell text seems to be ignorable
2099 if "dummy-ignore-skipped" in tags:
2100 continue
2101 if (
2102 col_idx not in has_covering_hdr
2103 and some_has_covered_text
2104 ):
2105 continue
2106 # don't ignore this cell if there's been a header
2107 # above it
2108 form = "-"
2109 elif col_idx in has_covering_hdr:
2110 some_has_covered_text = True
2112 # Handle ambiguous object concord. If a header
2113 # gives the "dummy-object-concord"-tag to a word,
2114 # replace person, number and gender tags with
2115 # their "object-" counterparts so that the verb
2116 # agrees with the object instead.
2117 # Use only when the verb has ONLY object agreement!
2118 # a پخول/Pashto
2119 if "dummy-object-concord" in tags: 2119 ↛ 2120line 2119 didn't jump to line 2120 because the condition on line 2119 was never true
2120 for subtag, objtag in object_concord_replacements.items():
2121 if subtag in tags:
2122 tags.remove(subtag)
2123 tags.add(objtag)
2125 # Remove the dummy mood tag that we sometimes
2126 # use to block adding other mood and related
2127 # tags
2128 tags = tags - set(
2129 [
2130 "dummy-mood",
2131 "dummy-tense",
2132 "dummy-ignore-skipped",
2133 "dummy-object-concord",
2134 "dummy-reset-headers",
2135 "dummy-use-as-coltags",
2136 "dummy-use-as-rowtags",
2137 "dummy-store-hdrspan",
2138 "dummy-load-stored-hdrspans",
2139 "dummy-reset-stored-hdrspans",
2140 "dummy-section-header",
2141 ]
2142 )
2144 # Perform language-specific tag replacements according
2145 # to rules in a table.
2146 lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
2147 if lang_tag_mappings is not None: 2147 ↛ 2148: line 2147 didn't jump to line 2148 because the condition on line 2147 was never true
2148 for pre, post in lang_tag_mappings.items():
2149 if all(t in tags for t in pre):
2150 tags = (tags - set(pre)) | set(post)
2152 # Warn if there are entries with empty tags
2153 if not tags:
2154 wxr.wtp.debug(
2155 "inflection table: empty tags for {}".format(form),
2156 sortid="inflection/1826",
2157 )
2159 # Warn if form looks like IPA
2160 ########## XXX ########
2161 # Because IPA is its own unicode block, we could also
2162 # technically do a Unicode name check to see if a string
2163 # contains IPA. Not all valid IPA characters are in the
2164 # IPA extension block, so you can technically have false
2165 # negatives if it's something like /toki/, but it
2166 # shouldn't give false positives.
2167 # Alternatively, you could make a list of IPA-admissible
2168 # characters and reject non-IPA stuff with that.
2169 if re.match(r"\s*/.*/\s*$", form): 2169 ↛ 2170: line 2169 didn't jump to line 2170 because the condition on line 2169 was never true
2170 wxr.wtp.debug(
2171 "inflection table form looks like IPA: "
2172 "form={} tags={}".format(form, tags),
2173 sortid="inflection/1840",
2174 )
2176 # Note that this checks `form`, not `in tags`
2177 if form == "dummy-ignored-text-cell": 2177 ↛ 2178: line 2177 didn't jump to line 2178 because the condition on line 2177 was never true
2178 continue
2180 if "dummy-remove-this-cell" in tags: 2180 ↛ 2181line 2180 didn't jump to line 2181 because the condition on line 2180 was never true
2181 continue
2183 # Add the form
2184 tags = list(sorted(tags))
2185 dt = {"form": form, "tags": tags, "source": source}
2186 if roman:
2187 dt["roman"] = roman
2188 if ipa:
2189 dt["ipa"] = ipa
2190 ret.append(dt)
2191 # If we got separate clitic form, add it
2192 if clitic:
2193 dt = {
2194 "form": clitic,
2195 "tags": tags + ["clitic"],
2196 "source": source,
2197 }
2198 ret.append(dt)
2199 return ret, form, some_has_covered_text
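# Editorial note (not part of the source): the row-over-column precedence
# above means that when a row header has already fixed a mood, a
# conflicting mood from a column header is dropped. Hypothetical sketch:
#   rt = {"imperative"}               # row header wins for "mood"
#   ct = ("indicative", "singular")   # column header
#   result: {"imperative", "singular"}  # "indicative" is skipped
# The same precedence applies to the "case" and "number" categories.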
2201 # First extract definitions from cells
2202 # See defs_ht for footnote defs stuff
2203 for row in rows:
2204 for cell in row:
2205 text, refs, defs, hdr_tags = extract_cell_content(
2206 lang, word, cell.text
2207 )
2208 # refs, defs = footnote stuff, defs -> (ref, def)
2209 add_defs(defs)
2210 # Extract definitions from text after table
2211 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after)
2212 add_defs(defs)
2214 # Then extract the actual forms
2215 ret = []
2216 hdrspans = []
2217 first_col_has_text = False
2218 rownum = 0
2219 title = None
2220 global_tags = []
2221 table_tags = []
2222 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits")
2223 form_replacements = get_lang_conf(lang, "form_replacements")
2224 form_transformations = get_lang_conf(lang, "form_transformations")
2225 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells")
2226 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups")
2228 for title in titles:
2229 more_global_tags, more_table_tags, extra_forms = parse_title(
2230 title, source
2231 )
2232 global_tags.extend(more_global_tags)
2233 table_tags.extend(more_table_tags)
2234 ret.extend(extra_forms)
2235 cell_rowcnt = collections.defaultdict(int)
2236 seen_cells = set()
2237 has_covering_hdr = set()
2238 some_has_covered_text = False
2239 for row in rows:
2240 # print("ROW:", row)
2241 # print("====")
2242 # print(f"Start of PREVIOUS row hdrspans:"
2243 # f"{tuple(sp.tagsets for sp in hdrspans)}")
2244 # print(f"Start of row txt: {tuple(t.text for t in row)}")
2245 if not row: 2245 ↛ 2246: line 2245 didn't jump to line 2246 because the condition on line 2245 was never true
2246 continue # Skip empty rows
2247 all_headers = all(x.is_title or not x.text.strip() for x in row)
2248 text = row[0].text
2249 if (
2250 row[0].is_title
2251 and text
2252 and not is_superscript(text[0])
2253 and text not in infl_map # zealous inflection map?
2254 and (
2255 re.match(r"Inflection ", text)
2256 or re.sub(
2257 r"\s+",
2258 " ", # flatten whitespace
2259 re.sub(
2260 r"\s*\([^)]*\)",
2261 "",
2262 # Remove whitespace+parens
2263 text,
2264 ),
2265 ).strip()
2266 not in infl_map
2267 )
2268 and not re.match(infl_start_re, text)
2269 and all(
2270 x.is_title == row[0].is_title and x.text == text
2271 # all InflCells in `row` have the same is_title and text
2272 for x in row
2273 )
2274 ):
2275 if text and title is None:
2276 # Only if there were no titles previously make the first
2277 # text that is found the title
2278 title = text
2279 if re.match(r"(Note:|Notes:)", title): 2279 ↛ 2280: line 2279 didn't jump to line 2280 because the condition on line 2279 was never true
2280 continue # not a title
2281 more_global_tags, more_table_tags, extra_forms = parse_title(
2282 title, source
2283 )
2284 global_tags.extend(more_global_tags)
2285 table_tags.extend(more_table_tags)
2286 ret.extend(extra_forms)
2287 continue # Skip title rows without incrementing i
2288 if "dummy-skip-this" in global_tags: 2288 ↛ 2289line 2288 didn't jump to line 2289 because the condition on line 2288 was never true
2289 return []
2290 rowtags = [()]
2291 # have_hdr = False
2292 # have_hdr never used?
2293 have_text = False
2294 samecell_cnt = 0
2295 col0_hdrspan = None # col0 or later header (despite its name)
2296 col0_followed_by_nonempty = False
2297 row_empty = True
2298 for col_idx, cell in enumerate(row):
2299 colspan = cell.colspan # >= 1
2300 rowspan = cell.rowspan # >= 1
2301 previously_seen = id(cell) in seen_cells
2302 # checks to see if this cell was in the previous ROW
2303 seen_cells.add(id(cell))
2304 if samecell_cnt == 0:
2305 # First column of a (possible multi-column) cell
2306 samecell_cnt = colspan - 1
2307 else:
2308 assert samecell_cnt > 0
2309 samecell_cnt -= 1
2310 continue
2312 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0
2313 # never used?
2315 # defaultdict(int) around line 1900
2316 cell_rowcnt[id(cell)] += 1
2317 # => how many rows of this cell have been seen
2318 col = cell.text
2319 if not col:
2320 continue
2321 row_empty = False
2322 is_title = cell.is_title
2324 # If the cell has a target, i.e., text after colon, interpret
2325 # it as simply specifying a value for that value and ignore
2326 # it otherwise.
2327 if cell.target:
2328 text, refs, defs, hdr_tags = extract_cell_content(
2329 lang, word, col
2330 )
2331 if not text: 2331 ↛ 2332: line 2331 didn't jump to line 2332 because the condition on line 2331 was never true
2332 continue
2333 refs_tags = set()
2334 for ref in refs: # gets tags from footnotes 2334 ↛ 2335: line 2334 didn't jump to line 2335 because the loop on line 2334 never started
2335 if ref in def_ht:
2336 refs_tags.update(def_ht[ref])
2337 rowtags = expand_header(
2338 wxr,
2339 tablecontext,
2340 word,
2341 lang,
2342 pos,
2343 text,
2344 [],
2345 silent=True,
2346 depth=depth,
2347 column_number=col_idx,
2348 )
2349 rowtags = list(
2350 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags)
2351 )
2352 is_title = False
2353 col = cell.target
2355 # print(rownum, col_idx, col)
2356 # print(f"is_title: {is_title}")
2357 if is_title:
2358 # It is a header cell
2359 text, refs, defs, hdr_tags = extract_cell_content(
2360 lang, word, col
2361 )
2362 if not text:
2363 continue
2364 # Extract tags from referenced footnotes
2365 refs_tags = set()
2366 for ref in refs:
2367 if ref in def_ht:
2368 refs_tags.update(def_ht[ref])
2370 # Expand header to tags
2371 v = expand_header(
2372 wxr,
2373 tablecontext,
2374 word,
2375 lang,
2376 pos,
2377 text,
2378 [],
2379 silent=True,
2380 depth=depth,
2381 column_number=col_idx,
2382 )
2383 # print("EXPANDED {!r} to {}".format(text, v))
2385 if col_idx == 0:
2386 # first_col_has_text is used for a test to ignore
2387 # upper-left cells that are just text without
2388 # header info
2389 first_col_has_text = True
2390 # Check if the header expands to reset hdrspans
2391 if any("dummy-reset-headers" in tt for tt in v):
2392 new_hdrspans = []
2393 for hdrspan in hdrspans:
2394 # if there are HdrSpan objects (abstract headers with
2395 # row- and column-spans) that are to the left or at the
2396 # same row or below, KEEP those; things above and to
2397 # the right of the hdrspan with dummy-reset-headers
2398 # are discarded. Tags from the header together with
2399 # dummy-reset-headers are kept as normal.
2400 if (
2401 hdrspan.start + hdrspan.colspan < col_idx
2402 or hdrspan.rownum > rownum - cell.rowspan
2403 ):
2404 new_hdrspans.append(hdrspan)
2405 hdrspans = new_hdrspans
2407 for tt in v:
2408 if "dummy-section-header" in tt: 2408 ↛ 2409line 2408 didn't jump to line 2409 because the condition on line 2408 was never true
2409 tablecontext.section_header = tt
2410 break
2411 if "dummy-reset-section-header" in tt: 2411 ↛ 2412line 2411 didn't jump to line 2412 because the condition on line 2411 was never true
2412 tablecontext.section_header = []
2413 # Text between headers on a row causes earlier headers to
2414 # be reset
2415 if have_text:
2416 # print(" HAVE_TEXT BEFORE HDR:", col)
2417 # Reset rowtags if new title column after previous
2418 # text cells
2419 # +-----+-----+-----+-----+
2420 # |hdr-a|txt-a|hdr-B|txt-B|
2421 # +-----+-----+-----+-----+
2422 # ^reset rowtags=>
2423 # XXX beware of header "—": "" - must not clear on that if
2424 # it expands to no tags
2425 rowtags = [()]
2426 # have_hdr = True
2427 # have_hdr never used?
2428 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags))
2429 # Update rowtags and coltags
2430 has_covering_hdr.add(col_idx) # col_idx == current column
2431 # has_covering_hdr is a set that has the col_idx-ids of columns
2432 # that have previously had some kind of header. It is never
2433 # reset inside the col_idx-loops OR the bigger rows-loop, so it
2434 # applies to the whole table.
2436 rowtags, new_coltags, all_hdr_tags = generate_tags(
2437 rowtags, table_tags
2438 )
2440 if any("dummy-skip-this" in ts for ts in rowtags):
2441 continue # Skip this cell
2443 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2443 ↛ 2444line 2443 didn't jump to line 2444 because the condition on line 2443 was never true
2444 hdrspans.extend(tablecontext.stored_hdrspans)
2446 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2446 ↛ 2447line 2446 didn't jump to line 2447 because the condition on line 2446 was never true
2447 tablecontext.stored_hdrspans = []
2449 if any("dummy-store-hdrspan" in ts for ts in v): 2449 ↛ 2451line 2449 didn't jump to line 2451 because the condition on line 2449 was never true
2450 # print(f"STORED: {col}")
2451 store_new_hdrspan = True
2452 else:
2453 store_new_hdrspan = False
2455 new_coltags = list(
2456 x
2457 for x in new_coltags
2458 if not any(t in noinherit_tags for t in x)
2459 )
2460 # print("new_coltags={} previously_seen={} all_hdr_tags={}"
2461 # .format(new_coltags, previously_seen, all_hdr_tags))
2462 if any(new_coltags):
2463 (
2464 col,
2465 col0_followed_by_nonempty,
2466 col0_hdrspan,
2467 ) = add_new_hdrspan(
2468 col,
2469 hdrspans,
2470 store_new_hdrspan,
2471 col0_followed_by_nonempty,
2472 col0_hdrspan,
2473 )
2475 continue
2477 # These values are ignored, at least for now
2478 if re.match(r"^(# |\(see )", col): 2478 ↛ 2479: line 2478 didn't jump to line 2479 because the condition on line 2478 was never true
2479 continue
2481 if any("dummy-skip-this" in ts for ts in rowtags):
2482 continue # Skip this cell
2484 # If the word has no rowtags and is a multi-row cell, then
2485 # ignore this. This happens with empty separator rows
2486 # within a rowspan>1 cell. cf. wander/English/Conjugation.
2487 if rowtags == [()] and rowspan > 1:
2488 continue
2490 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle.
2491 if cleanup_rules:
2492 for regx, substitution in cleanup_rules.items():
2493 col = re.sub(regx, substitution, col)
2495 if ( 2495 ↛ 2500: line 2495 didn't jump to line 2500 because the condition on line 2495 was never true
2496 col_idx == 0
2497 and not first_col_has_text
2498 and get_lang_conf(lang, "ignore_top_left_text_cell") is True
2499 ):
2500 continue # Skip text at top left, as in Icelandic, Faroese
2502 # if col0_hdrspan is not None:
2503 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}"
2504 # .format(col0_hdrspan.text, col))
2505 col0_followed_by_nonempty = True
2506 have_text = True
2508 # Determine column tags for the multi-column cell
2509 combined_coltags = compute_coltags(
2510 lang, pos, hdrspans, col_idx, colspan, col
2511 )
2512 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2512 ↛ 2513line 2512 didn't jump to line 2513 because the condition on line 2512 was never true
2513 continue
2515 # Split the text into separate forms. First simplify spaces except
2516 # newline.
2517 col = re.sub(r"[ \t\r]+", " ", col)
2518 # Split the cell text into alternatives
2520 col, alts, split_extra_tags = split_text_into_alts(col)
2522 # Some cells have mixed form content, like text and romanization,
2523 # or text and IPA. Handle these.
2524 alts = handle_mixed_lines(alts)
2526 alts = list((x, combined_coltags) for x in alts)
2528 # Generate forms from the alternatives
2529 # alts is a list of (tuple of forms, tuple of tags)
2530 for (form, base_roman, ipa), coltags in alts:
2531 form = form.strip()
2532 extra_tags = []
2533 extra_tags.extend(split_extra_tags)
2534 # Handle special splits again here, so that we can have custom
2535 # mappings from form to form and tags.
2536 if form in form_replacements:
2537 replacement, tags = form_replacements[form]
2538 for x in tags.split():
2539 assert x in valid_tags
2540 assert isinstance(replacement, str)
2541 assert isinstance(tags, str)
2542 form = replacement
2543 extra_tags.extend(tags.split())
2545 check_romanization_form_transformation = False
2546 # loop over regexes in form_transformation and replace text
2547 # in form using regex patterns
2548 # this does a bit of the same stuff the above does,
2549 # but with regexes and re.sub() instead
2550 for (
2551 form_transformations_pos,
2552 v,
2553 subst,
2554 tags,
2555 ) in form_transformations:
2556 # v is a pattern string, like "^ich"
2557 if (
2558 isinstance(form_transformations_pos, str)
2559 and pos != form_transformations_pos
2560 ) or (
2561 (not isinstance(form_transformations_pos, str))
2562 and pos not in form_transformations_pos
2563 ):
2564 continue
2565 m = re.search(v, form)
2566 if m is not None:
2567 if base_roman: 2567 ↛ 2568: line 2567 didn't jump to line 2568 because the condition on line 2567 was never true
2568 for _, rom_v, rom_sub, _ in form_transformations:
2569 rom_m = re.search(rom_v, base_roman)
2570 if rom_m is not None:
2571 base_roman = re.sub(
2572 rom_v, rom_sub, base_roman
2573 )
2574 break
2575 form = re.sub(v, subst, form)
2576 for x in tags.split():
2577 assert x in valid_tags
2578 extra_tags.extend(tags.split())
2579 check_romanization_form_transformation = True
2580 break
2582 # Clean the value, extracting reference symbols
2583 form, refs, defs, hdr_tags = extract_cell_content(
2584 lang, word, form
2585 )
2586 # if refs:
2587 # print("REFS:", refs)
2588 extra_tags.extend(hdr_tags)
2589 # Extract tags from referenced footnotes
2590 refs_tags = set()
2591 for ref in refs:
2592 if ref in def_ht:
2593 refs_tags.update(def_ht[ref])
2595 if base_roman:
2596 if check_romanization_form_transformation: 2596 ↛ 2600: line 2596 didn't jump to line 2600 because the condition on line 2596 was never true
2597 # because form_transformations are used to handle things
2598 # where the romanization has the "same" structure, we
2599 # need to handle that here too....
2600 for (
2601 _,
2602 v,
2603 subst,
2604 _,
2605 ) in form_transformations:
2606 # v is a pattern string, like "^ich"
2607 m = re.search(v, base_roman)
2608 if m is not None:
2609 base_roman = re.sub(v, subst, base_roman)
2610 # XXX add tag stuff here if needed
2611 break
2613 base_roman, _, _, hdr_tags = extract_cell_content(
2614 lang, word, base_roman
2615 )
2616 extra_tags.extend(hdr_tags)
2618 # Do some additional cleanup on the cell.
2619 form = re.sub(r"^\s*,\s*", "", form)
2620 form = re.sub(r"\s*,\s*$", "", form)
2621 form = re.sub(r"\s*(,\s*)+", ", ", form)
2622 form = re.sub(r"(?i)^Main:", "", form)
2623 form = re.sub(r"\s+", " ", form)
2624 form = form.strip()
2626 # Look for parentheses that have semantic meaning
2627 form, et = find_semantic_parens(form)
2628 extra_tags.extend(et)
2630 # Handle parentheses in the table element. We parse
2631 # tags anywhere and romanizations anywhere but beginning.
2632 roman = base_roman
2633 paren = None
2634 clitic = None
2635 m = re.search(r"(\s+|^)\(([^)]*)\)", form)
2636 # start|spaces + (anything)
2637 if m is not None:
2638 subst = m.group(1)
2639 paren = m.group(2)
2640 else:
2641 m = re.search(r"\(([^)]*)\)(\s+|$)", form)
2642 # (anything) + spaces|end
2643 if m is not None: 2643 ↛ 2644: line 2643 didn't jump to line 2644 because the condition on line 2643 was never true
2644 paren = m.group(1)
2645 subst = m.group(2)
2646 if paren is not None:
2647 form, roman, clitic = handle_parens(
2648 form, roman, clitic, extra_tags
2649 )
2651 # Ignore certain forms that are not really forms,
2652 # unless they're really, really close to the article title
2653 if form in ( 2653 ↛ 2658: line 2653 didn't jump to line 2658 because the condition on line 2653 was never true
2654 "",
2655 "unchanged",
2656 "after an", # in sona/Irish/Adj/Mutation
2657 ):
2658 Lev = distw([form], word)
2659 if form and Lev < 0.1:
2660 wxr.wtp.debug(
2661 "accepted possible false positive '{}' with"
2662 "> 0.1 Levenshtein distance in {}/{}".format(
2663 form, word, lang
2664 ),
2665 sortid="inflection/2213",
2666 )
2667 elif form and Lev < 0.3:
2668 wxr.wtp.debug(
2669 "skipped possible match '{}' with > 0.3"
2670 "Levenshtein distance in {}/{}".format(
2671 form, word, lang
2672 ),
2673 sortid="inflection/2218",
2674 )
2675 continue
2676 else:
2677 continue
2678 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} "
2679 # "FORM={!r} ROMAN={!r}"
2680 # .format(rowtags, coltags, refs_tags,
2681 # form, roman))
2683 # Merge tags from row and column and do miscellaneous
2684 # tag-related handling.
2685 (
2686 merge_ret,
2687 form,
2688 some_has_covered_text,
2689 ) = merge_row_and_column_tags(form, some_has_covered_text)
2690 ret.extend(merge_ret)
2692 # End of row.
2693 rownum += 1
2694 # For certain languages, if the row was empty, reset
2695 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb).
2696 if row_empty and get_lang_conf(lang, "empty_row_resets"):
2697 hdrspans = []
2698 # Check if we should expand col0_hdrspan.
2699 if col0_hdrspan is not None:
2700 col0_allowed = get_lang_conf(lang, "hdr_expand_first")
2701 col0_cats = tagset_cats(col0_hdrspan.tagsets)
2702 # Only expand if col0_cats and later_cats are allowed
2703 # and don't overlap and col0 has tags, and there have
2704 # been no disallowed cells in between.
2705 if (
2706 not col0_followed_by_nonempty
2707 and not (col0_cats - col0_allowed)
2708 and
2709 # len(col0_cats) == 1 and
2710 col_idx > col0_hdrspan.start + col0_hdrspan.colspan
2711 ):
2712 # If an earlier header is only followed by headers that yield
2713 # no tags, expand it to entire row
2714 # print("EXPANDING COL0: {} from {} to {} cols {}"
2715 # .format(col0_hdrspan.text, col0_hdrspan.colspan,
2716 # len(row) - col0_hdrspan.start,
2717 # col0_hdrspan.tagsets))
2718 col0_hdrspan.colspan = len(row) - col0_hdrspan.start
2719 col0_hdrspan.expanded = True
2720 # XXX handle refs and defs
2721 # for x in hdrspans:
2722 # print(" HDRSPAN {} {} {} {!r}"
2723 # .format(x.start, x.colspan, x.tagsets, x.text))
2725 # Post-process German nouns with articles in separate columns. We move the
2726 # definite/indefinite/usually-without-article markers into the noun and
2727 # remove the article entries.
2728 if get_lang_conf(lang, "articles_in_separate_columns") and any(
2729 "noun" in x["tags"] for x in ret
2730 ):
2731 new_ret = []
2732 saved_tags = set()
2733 had_noun = False
2734 for dt in ret:
2735 tags = dt["tags"]
2736 # print(tags)
2737 if "noun" in tags:
2738 tags = list(
2739 sorted(set(t for t in tags if t != "noun") | saved_tags)
2740 )
2741 had_noun = True
2742 elif ( 2742 ↛ 2769: line 2742 didn't jump to line 2769 because the condition on line 2742 was always true
2743 "indefinite" in tags
2744 or "definite" in tags
2745 or "usually-without-article" in tags
2746 or "without-article" in tags
2747 ):
2748 if had_noun:
2749 saved_tags = set(tags)
2750 else:
2751 saved_tags = saved_tags | set(tags) # E.g. Haus/German
2752 remove_useless_tags(lang, pos, saved_tags)
2753 saved_tags = saved_tags & set(
2754 [
2755 "masculine",
2756 "feminine",
2757 "neuter",
2758 "singular",
2759 "plural",
2760 "indefinite",
2761 "definite",
2762 "usually-without-article",
2763 "without-article",
2764 ]
2765 )
2766 had_noun = False
2767 continue # Skip the articles
2769 dt = dt.copy()
2770 dt["tags"] = tags
2771 new_ret.append(dt)
2772 ret = new_ret
2774 elif possibly_ignored_forms:
2775 # Some languages have tables with cells that are kind of separated
2776 # and difficult to handle, like eulersche Formel/German where
2777 # the definite and indefinite articles are just floating.
2778 # If a language has a dict of conditionally_ignored_cells,
2779 # and if the contents of a cell is found in one of the rules
2780 # there, ignore that cell if it
2781 # 1. Does not have the appropriate tag (like "definite" for "die")
2782 # and
2783 # 2. The title of the article is not one of the other co-words
2784 # (i.e. it's an article for the definite articles in German etc.)
2785 # pass
2786 new_ret = []
2787 for cell_data in ret:
2788 tags = cell_data["tags"]
2789 text = cell_data["form"]
2790 skip_this = False
2791 for key_tag, ignored_forms in possibly_ignored_forms.items():
2792 if text not in ignored_forms: 2792 ↛ 2794: line 2792 didn't jump to line 2794 because the condition on line 2792 was always true
2793 continue
2794 if word in ignored_forms:
2795 continue
2796 if key_tag not in tags:
2797 skip_this = True
2799 if skip_this: 2799 ↛ 2800: line 2799 didn't jump to line 2800 because the condition on line 2799 was never true
2800 continue
2801 new_ret.append(cell_data)
2803 ret = new_ret
2805 # Post-process English inflection tables, adding "multiword-construction"
2806 # when the number of words has increased.
2807 if lang == "English" and pos == "verb":
2808 word_words = len(word.split())
2809 new_ret = []
2810 for dt in ret:
2811 form = dt.get("form", "")
2812 if len(form.split()) > word_words:
2813 dt = dt.copy()
2814 dt["tags"] = list(dt.get("tags", []))
2815 # This strange copy-assigning shuffle is preventative black
2816 # magic; do not touch lest you invoke deep bugs.
2817 data_append(dt, "tags", "multiword-construction")
2818 new_ret.append(dt)
2819 ret = new_ret
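# Editorial sketch (not part of the source): with word = "wander" (one
# word), a hypothetical returned form
#   {"form": "have wandered", "tags": ["perfect"], "source": source}
# gains "multiword-construction" because it has more words than the title:
#   {"form": "have wandered", "tags": ["perfect", "multiword-construction"], ...}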
2821 # Always insert "table-tags" detail as the first entry in any inflection
2822 # table. This way we can reliably detect where a new table starts.
2823 # Table-tags applies until the next table-tags entry.
2824 if ret or table_tags:
2825 table_tags = list(sorted(set(table_tags)))
2826 dt = {
2827 "form": " ".join(table_tags),
2828 "source": source,
2829 "tags": ["table-tags"],
2830 }
2831 if dt["form"] == "":
2832 dt["form"] = "no-table-tags"
2833 if tablecontext.template_name:
2834 tn = {
2835 "form": tablecontext.template_name,
2836 "source": source,
2837 "tags": ["inflection-template"],
2838 }
2839 ret = [dt] + [tn] + ret
2840 else:
2841 ret = [dt] + ret
2843 return ret
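# Editorial sketch (not part of the source): the sentinel entries that
# start every table's output, with a hypothetical template name:
#   [{"form": "no-table-tags", "source": source, "tags": ["table-tags"]},
#    {"form": "fi-conj-muistaa", "source": source,
#     "tags": ["inflection-template"]},
#    ...the actual form entries...]
# Consumers can split a word's "forms" list back into per-table groups by
# scanning for the "table-tags" entries.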
2846def handle_generic_table(
2847 wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
2848):
2849 assert isinstance(wxr, WiktextractContext)
2850 assert isinstance(data, dict)
2851 assert isinstance(word, str)
2852 assert isinstance(lang, str)
2853 assert isinstance(pos, str)
2854 assert isinstance(rows, list)
2855 assert isinstance(source, str)
2856 assert isinstance(after, str)
2857 assert isinstance(depth, int)
2858 for row in rows:
2859 assert isinstance(row, list)
2860 for x in row:
2861 assert isinstance(x, InflCell)
2862 assert isinstance(titles, list)
2863 for x in titles:
2864 assert isinstance(x, str)
2866 # Try to parse the table as a simple table
2867 ret = parse_simple_table(
2868 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
2869 )
2870 if ret is None: 2870 ↛ 2873: line 2870 didn't jump to line 2873 because the condition on line 2870 was never true
2871 # XXX handle other table formats
2872 # We were not able to handle the table
2873 wxr.wtp.debug(
2874 "unhandled inflection table format, {}/{}".format(word, lang),
2875 sortid="inflection/2370",
2876 )
2877 return
2879 # Add the returned forms but eliminate duplicates.
2880 have_forms = set()
2881 for dt in ret:
2882 fdt = freeze(dt)
2883 if fdt in have_forms:
2884 continue # Don't add duplicates
2885 # Some Russian words have Declension and Pre-reform declension sections
2886 # partially duplicating the same data. Don't add the "dated" tag variant if
2887 # we already have the same form without "dated" from the modern declension table
2889 tags = dt.get("tags", [])
2890 for dated_tag in ("dated",):
2891 if dated_tag in tags:
2892 dt2 = dt.copy()
2893 tags2 = list(x for x in tags if x != dated_tag)
2894 dt2["tags"] = tags2
2895 if tags2 and freeze(dt2) in have_forms: 2895 ↛ 2896: line 2895 didn't jump to line 2896 because the condition on line 2895 was never true
2896 break # Already have the same form without "dated"
2897 else:
2898 if "table-tags" not in tags:
2899 have_forms.add(fdt)
2900 data_append(data, "forms", dt)
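# Editorial note (not part of the source): freeze() turns the mutable
# form dict into a hashable value so duplicates can be dropped via a set.
# Minimal sketch with hypothetical data; assumes equal dicts freeze to
# equal values:
#   >>> have = set()
#   >>> dt = {"form": "books", "tags": ["plural"], "source": "declension"}
#   >>> have.add(freeze(dt))
#   >>> freeze(dict(dt)) in have
#   True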
2903def determine_header(
2904 wxr,
2905 tablecontext,
2906 lang,
2907 word,
2908 pos,
2909 table_kind,
2910 kind,
2911 style,
2912 row,
2913 col,
2914 celltext,
2915 titletext,
2916 cols_headered,
2917 target,
2918 cellstyle,
2919):
2920 assert isinstance(table_kind, NodeKind)
2921 assert isinstance(kind, (NodeKind, str))
2922 assert style is None or isinstance(style, str)
2923 assert cellstyle is None or isinstance(cellstyle, str)
2925 if table_kind == NodeKind.TABLE:
2926 header_kind = NodeKind.TABLE_HEADER_CELL
2927 elif table_kind == NodeKind.HTML: 2927 ↛ 2929: line 2927 didn't jump to line 2929 because the condition on line 2927 was always true
2928 header_kind = "th"
2929 idx = celltext.find(": ")
2930 is_title = False
2931 # remove anything in parentheses, compress whitespace, .strip()
2932 cleaned_titletext = re.sub(
2933 r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
2934 ).strip()
2935 cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
2936 cleaned = re.sub(r"\s+", " ", cleaned)
2937 hdr_expansion = expand_header(
2938 wxr,
2939 tablecontext,
2940 word,
2941 lang,
2942 pos,
2943 cleaned,
2944 [],
2945 silent=True,
2946 ignore_tags=True,
2947 )
2948 candidate_hdr = not any(
2949 any(t.startswith("error-") for t in ts) for ts in hdr_expansion
2950 )
2951 # KJ candidate_hdr says that a specific cell is a candidate
2952 # for being a header because it passed through expand_header
2953 # without getting any "error-" tags; that is, the content
2954 # is "valid" for being a header; these are the false positives
2955 # we want to catch
2956 ignored_cell = any(
2957 any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
2958 )
2959 # ignored_cell should NOT be used to filter for headers, like
2960 # candidate_hdr is used, but only to filter for related *debug
2961 # messages*: some dummy-tags are actually half-way to headers,
2962 # like ones with "Notes", so they MUST be headers, but later
2963 # on they're ignored *as* headers so they don't need to print
2964 # out any cells-as-headers debug messages.
2965 if (
2966 candidate_hdr
2967 and kind != header_kind
2968 and cleaned != ""
2969 and cleaned != "dummy-ignored-text-cell"
2970 and cleaned not in IGNORED_COLVALUES
2971 ):
2972 # print("col: {}".format(col))
2973 if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
2974 wxr.wtp.debug(
2975 "rejected heuristic header: "
2976 "table cell identified as header and given "
2977 "candidate status, BUT {} is not in "
2978 "LANGUAGES_WITH_CELLS_AS_HEADERS; "
2979 "cleaned text: {}".format(lang, cleaned),
2980 sortid="inflection/2447",
2981 )
2982 candidate_hdr = False
2983 elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
2984 wxr.wtp.debug(
2985 "rejected heuristic header: "
2986 "table cell identified as header and given "
2987 "candidate status, BUT the cleaned text is "
2988 "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
2989 "cleaned text: {}".format(lang, cleaned),
2990 sortid="inflection/2457",
2991 )
2992 candidate_hdr = False
2993 else:
2994 wxr.wtp.debug(
2995 "accepted heuristic header: "
2996 "table cell identified as header and given "
2997 "candidate status, AND the cleaned text is "
2998 "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
2999 "cleaned text: {}".format(lang, cleaned),
3000 sortid="inflection/2466",
3001 )
3003 # If the cell starts with something that could start a
3004 # definition (typically a reference symbol), make it a candidate
3005 # regardless of whether the language is listed.
3006 if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned): 3006 ↛ 3007: line 3006 didn't jump to line 3007 because the condition on line 3006 was never true
3007 candidate_hdr = True
3009 # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
3010 # "lang={} pos={}"
3011 # .format(titletext, hdr_expansion, candidate_hdr,
3012 # lang, pos))
3013 if idx >= 0 and titletext[:idx] in infl_map:
3014 target = titletext[idx + 2 :].strip()
3015 celltext = celltext[:idx]
3016 is_title = True
3017 elif (
3018 kind == header_kind
3019 and " + " not in titletext # For "avoir + blah blah"?
3020 and not any(
3021 isinstance(x, WikiNode)
3022 and x.kind == NodeKind.HTML
3023 and x.sarg == "span"
3024 and x.attrs.get("lang") in ("az",)
3025 for x in col.children
3026 )
3027 ):
3028 is_title = True
3029 elif (
3030 candidate_hdr
3031 and cleaned_titletext not in IGNORED_COLVALUES
3032 and distw([cleaned_titletext], word) > 0.3
3033 and cleaned_titletext not in ("I", "es")
3034 ):
3035 is_title = True
3036 # if first column or same style as first column
3037 elif (
3038 style == cellstyle
3039 and
3040 # and title is not identical to word name
3041 titletext != word
3042 and cleaned not in IGNORED_COLVALUES
3043 and cleaned != "dummy-ignored-text-cell"
3044 and
3045 # the style composite string is not broken
3046 not style.startswith("////")
3047 and " + " not in titletext
3048 ):
3049 if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS: 3049 ↛ 3050: line 3049 didn't jump to line 3050 because the condition on line 3049 was never true
3050 wxr.wtp.debug(
3051 "rejected heuristic header: "
3052 "table cell identified as header based "
3053 "on style, BUT {} is not in "
3054 "LANGUAGES_WITH_CELLS_AS_HEADERS; "
3055 "cleaned text: {}, style: {}".format(lang, cleaned, style),
3056 sortid="inflection/2512",
3057 )
3058 elif ( 3058 ↛ 3062: line 3058 didn't jump to line 3062 because the condition on line 3058 was never true
3059 not ignored_cell
3060 and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
3061 ):
3062 wxr.wtp.debug(
3063 "rejected heuristic header: "
3064 "table cell identified as header based "
3065 "on style, BUT the cleaned text is "
3066 "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
3067 "cleaned text: {}, style: {}".format(lang, cleaned, style),
3068 sortid="inflection/2522",
3069 )
3070 else:
3071 wxr.wtp.debug(
3072 "accepted heuristic header: "
3073 "table cell identified as header based "
3074 "on style, AND the cleaned text is "
3075 "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
3076 "cleaned text: {}, style: {}".format(lang, cleaned, style),
3077 sortid="inflection/2530",
3078 )
3079 is_title = True
3080 if ( 3080 ↛ 3087: line 3080 didn't jump to line 3087 because the condition on line 3080 was never true
3081 not is_title
3082 and len(row) < len(cols_headered)
3083 and cols_headered[len(row)]
3084 ):
3085 # Whole column has title suggesting they are headers
3086 # (e.g. "Case")
3087 is_title = True
3088 if re.match(
3089 r"Conjugation of |Declension of |Inflection of |"
3090 r"Mutation of |Notes\b", # \b is word-boundary
3091 titletext,
3092 ):
3093 is_title = True
3094 return is_title, hdr_expansion, target, celltext
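# Editorial note (not part of the source): determine_header() is a stack
# of heuristics. Roughly, a cell is a header when its cleaned text
# expands through infl_map without "error-" tags, or when it shares the
# styling of a known header cell, or when its whole column was marked
# headered. Sketch of the first gate with a hypothetical cell:
#   expand_header(..., "nominative", ...) -> [("nominative",)]  # no error-*
#   => candidate_hdr stays True unless the language fails the
#      LANGUAGES_WITH_CELLS_AS_HEADERS checks above.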
3097class TableContext:
3098 """Saved context used when parsing a table and its subtables."""
3100 __slots__ = (
3101 "stored_hdrspans",
3102 "section_header",
3103 "template_name",
3104 )
3106 def __init__(self, template_name=None):
3107 self.stored_hdrspans = []
3108 self.section_header = []
3109 if not template_name:
3110 self.template_name = ""
3111 else:
3112 self.template_name = template_name
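# Editorial sketch (not part of the source): typical construction when a
# template invocation wraps the table (template name is hypothetical):
#   tablecontext = TableContext("sv-conj-wk")
#   tablecontext.template_name    # -> "sv-conj-wk"
#   tablecontext.stored_hdrspans  # -> [] until "dummy-store-hdrspan" fires
# A fresh TableContext() is created in handle_wikitext_or_html_table()
# when the caller does not pass one, so state does not leak across tables.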
3115def handle_wikitext_or_html_table(
3116 wxr: WiktextractContext,
3117 word: str,
3118 lang: str,
3119 pos: str,
3120 data,
3121 tree,
3122 titles,
3123 source,
3124 after,
3125 tablecontext: TableContext | None = None,
3126):
3127 """Parses a table from parsed Wikitext format into rows and columns of
3128 InflCell objects and then calls handle_generic_table() to parse it into
3129 forms. This adds the forms into ``data``."""
3130 assert isinstance(wxr, WiktextractContext)
3131 assert isinstance(word, str)
3132 assert isinstance(lang, str)
3133 assert isinstance(pos, str)
3134 assert isinstance(data, dict)
3135 assert isinstance(tree, WikiNode)
3136 assert tree.kind == NodeKind.TABLE or (
3137 tree.kind == NodeKind.HTML and tree.sarg == "table"
3138 )
3139 assert isinstance(titles, list)
3140 assert isinstance(source, str)
3141 for x in titles:
3142 assert isinstance(x, str)
3143 assert isinstance(after, str)
3144 assert tablecontext is None or isinstance(tablecontext, TableContext)
3145 # Imported here to avoid a circular import
3146 from wiktextract.page import clean_node, recursively_extract
3148 # from wikitextprocessor.parser import print_tree
3149 # print_tree(tree)
3150 # print("-------==========-------")
3152 if not tablecontext:
3153 tablecontext = TableContext()
3155 # Get language specific text removal patterns
3156 remove_text_patterns: (
3157 dict[tuple[str, ...], tuple[str | re.Pattern, ...]] | None
3158 ) = None
3159 if rem := get_lang_conf(lang, "remove_text_patterns"):
3160 for poses in rem.keys():
3161 if pos in poses:
3162 remove_text_patterns = rem[poses]
3163 break
3165 def handle_table1(
3166 wxr,
3167 tablecontext,
3168 word,
3169 lang,
3170 pos,
3171 data,
3172 tree,
3173 titles,
3174 source,
3175 after,
3176 depth,
3177 ):
3178 """Helper function allowing the 'flattening' out of the table
3179 recursion: instead of handling the tables in the wrong order
3180 (recursively), this function adds to new_row that is then
3181 iterated through in the main function at the end, creating
3182 a longer table (still in pieces) in the correct order."""
3184 assert isinstance(data, dict)
3185 assert isinstance(titles, list)
3186 assert isinstance(source, str)
3187 for x in titles:
3188 assert isinstance(x, str)
3189 assert isinstance(after, str)
3190 assert isinstance(depth, int)
3191 # print("HANDLE_WIKITEXT_TABLE", titles)
3193 col_gap_data = [] # Filling for columns with rowspan > 1
3194 # col_gap_data contains None or InflCell
3195 vertical_still_left = [] # Number of remaining rows for which to fill
3196 # the column; vertical_still_left contains int
3197 cols_headered = [] # [F, T, F, F...]
3198 # True when the whole column contains headers, even
3199 # when the cell is not considered a header; triggered
3200 # by the "*" inflmap meta-tag.
3201 rows = []
3203 sub_ret = []
3205 # from wikitextprocessor.parser import print_tree
3206 # print_tree(tree)
3207 for node in tree.children:
3208 if not isinstance(node, WikiNode):
3209 continue
3210 if node.kind == NodeKind.HTML:
3211 kind = node.sarg
3212 else:
3213 kind = node.kind
3215 # print(" {}".format(node))
3216 if kind in (NodeKind.TABLE_CAPTION, "caption"):
3217 # print(" CAPTION:", node)
3218 pass
3219 elif kind in (NodeKind.TABLE_ROW, "tr"):
3220 if "vsShow" in node.attrs.get("class", "").split():
3221 # vsShow rows are those that are initially shown in tables
3222 # that have more data. The hidden data duplicates these
3223 # rows, so we skip it and just process the hidden data.
3224 continue
3226 # if (
3227 # len(node.children) == 1
3228 # and node.children[0].attrs.get("class") == "separator"
3229 # ):
3230 # print("------------------ skip separator")
3231 # continue
3233 # Parse a table row.
3234 row = []
3235 style = None
3236 row_has_nonempty_cells = False
3237 # has a nonempty cell that did not come from a rowspan fill
3238 for col in get_table_cells(node):
3239 # loop through each cell in the ROW
3241 # The below skip is not needed anymore, because we "skip" in
3242 # get_table_cells, but left here as a comment
3243 # if not isinstance(col, WikiNode):
3244 # # This skip is not used for counting,
3245 # # "None" is not used in
3246 # # indexing or counting or looping.
3247 # continue
3248 if col.kind == NodeKind.HTML:
3249 kind = col.sarg
3250 else:
3251 kind = col.kind
3252 if kind not in (
3253 NodeKind.TABLE_HEADER_CELL,
3254 NodeKind.TABLE_CELL,
3255 "th",
3256 "td",
3257 ):
3258 print(" UNEXPECTED ROW CONTENT: {}".format(col))
3259 continue
3261 while (
3262 len(row) < len(vertical_still_left)
3263 and vertical_still_left[len(row)] > 0
3264 ):
3265 # vertical_still_left is [...0, 0, 2...] for each
3266 # column. It is populated at the end of the loop, at the
3267 # same time as col_gap_data. This needs to be looped and
3268 # filled this way because each `for col` iteration jumps
3269 # straight to the next meaningful cell; there are no
3270 # "None" cells, only emptiness between, and rowspan and
3271 # colspan are just to generate the "fill-in" cells.
3272 vertical_still_left[len(row)] -= 1
3273 row.append(col_gap_data[len(row)])
3275 # appending to row is how "indexing" is
3276 # done here; something is appended,
3277 # like a filler cell here or a "start"
3278 # cell at the end of the row loop,
3279 # which increases len(row), which is
3280 # then used as the target index to check
3281 # for gaps. vertical_still_left is
3282 # the countdown to when to stop
3283 # filling in gaps, and goes down to 0,
3284 # and col_gap_data is not touched
3285 # except when a new rowspan is needed,
3286 # at the same time that
3287 # vertical_still_left gets reassigned.
3289 try:
3290 rowspan = int(col.attrs.get("rowspan", "1")) # 🡙
3291 colspan = int(col.attrs.get("colspan", "1")) # 🡘
3292 except ValueError:
3293 rowspan = 1
3294 colspan = 1
3295 # print("COL:", col)
3297 # These errors were too common, so reporting is disabled
3298 if colspan > 100:
3299 # wxr.wtp.error(
3300 # f"Colspan {colspan} over 100, clamped to 100",
3301 # sortid="inflection/20250113a",
3302 # )
3303 colspan = 100
3304 if rowspan > 100:
3305 # wxr.wtp.error(
3306 # f"Rowspan {rowspan} over 100, clamped to 100",
3307 # sortid="inflection/20250113b",
3308 # )
3309 rowspan = 100
3311 # Process any nested tables recursively.
3312 tables, rest = recursively_extract(
3313 col,
3314 lambda x: isinstance(x, WikiNode)
3315 and (x.kind == NodeKind.TABLE or x.sarg == "table"),
3316 )
3318 # Clean the rest of the cell.
3319 celltext = clean_node(wxr, None, rest)
3320 # print(f"CLEANED: {celltext=}")
3321 # print(f"SUBTABLES: {tables}")
3323 # Remove regexed patterns from text
3324 if remove_text_patterns is not None:
3325 for pat in remove_text_patterns:
3326 celltext = re.sub(pat, "", celltext)
3327 # print(f"AFTER: {celltext=} <<")
3329 # Handle nested tables.
3330 for tbl in tables:
3331 # Some nested tables (e.g., croí/Irish) have subtitles
3332 # as normal paragraphs in the same cell, under a
3333 # descriptive text that should be treated as a title
3334 # (e.g., "Forms with the definite article", with
3335 # "definite" not mentioned elsewhere).
3336 new_titles = list(titles)
3337 if celltext:
3338 new_titles.append(celltext)
3339 subtbl = handle_table1(
3340 wxr,
3341 tablecontext,
3342 word,
3343 lang,
3344 pos,
3345 data,
3346 tbl,
3347 new_titles,
3348 source,
3349 "",
3350 depth + 1,
3351 )
3352 if subtbl:
3353 sub_ret.append((rows, titles, after, depth))
3354 rows = []
3355 titles = []
3356 after = ""
3357 sub_ret.extend(subtbl)
3359 # This magic value is used as part of header detection
3360 cellstyle = (
3361 col.attrs.get("style", "")
3362 + "//"
3363 + col.attrs.get("class", "")
3364 + "//"
3365 + str(kind)
3366 )
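# (Hypothetical example value, for illustration only:
#  'background:#c0cfe4//separator//NodeKind.TABLE_CELL')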
3368 if not row: # if first column in row
3369 style = cellstyle
3370 target = None
3371 titletext = celltext.strip()
3372 while titletext and is_superscript(titletext[-1]):
3373 titletext = titletext[:-1]
3375 (
3376 is_title,
3377 hdr_expansion,
3378 target,
3379 celltext,
3380 ) = determine_header(
3381 wxr,
3382 tablecontext,
3383 lang,
3384 word,
3385 pos,
3386 tree.kind,
3387 kind,
3388 style,
3389 row,
3390 col,
3391 celltext,
3392 titletext,
3393 cols_headered,
3394 None,
3395 cellstyle,
3396 )
3398 if is_title:
3399 # If this cell gets a "*" tag, make the whole column
3400 # below it (toggling it in cols_headered = [F, F, T...])
3401 # into headers.
3402 while len(cols_headered) <= len(row):
3403 cols_headered.append(False)
3404 if any("*" in tt for tt in hdr_expansion):
3405 cols_headered[len(row)] = True
3406 celltext = ""
3407 # if row_has_nonempty_cells has been True at some point, it
3408 # keeps on being True.
3409 # if row_has_nonempty_cells or is_title or celltext != "":
3410 # row_has_nonempty_cells = True
3411 # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
3412 row_has_nonempty_cells |= is_title or celltext != ""
3413 cell = InflCell(
3414 celltext, is_title, colspan, rowspan, target
3415 )
3416 for _ in range(0, colspan):
3417 # colspan🡘 current loop (col) or 1
3418 # All the data-filling for colspan
3419 # is done simply in this loop,
3420 # while rowspan needs to use
3421 # vertical_still_left to count gaps
3422 # and col_gap_data to fill in
3423 # those gaps with InflCell data.
3424 if rowspan > 1: # rowspan🡙 current loop (col) or 1
3425 while len(col_gap_data) <= len(row):
3426 # Initialize col_gap_data/ed if
3427 # it is lacking slots
3428 # for each column; col_gap_data and
3429 # vertical_still_left are never
3430 # reset to [], during
3431 # the whole table function.
3432 col_gap_data.append(None)
3433 vertical_still_left.append(0)
3434 # Below is where the "rectangle" block of rowspan
3435 # and colspan is filled for the future.
3436 col_gap_data[len(row)] = cell
3437 # col_gap_data contains cells that
3438 # will be used in the
3439 # future, or None
3440 vertical_still_left[len(row)] = rowspan - 1
3441 # A counter for how many gaps🡙 are still left to be
3442 # filled (via the
3443 # row.append(col_gap_data[len(row)])
3444 # calls above); it is not reset to [], but decremented to 0
3445 # each time a row gets something from col_gap_data.
3446 # Append this cell 1+ times for colspan🡘
3447 row.append(cell)
3448 if not row:
3449 continue
3450 # After looping over the original row-nodes above, fill
3451 # in the rest of the row if trailing columns are still
3452 # covered by a rowspan (and possibly colspan) from above.
3453 for i in range(len(row), len(vertical_still_left)):
3454 if vertical_still_left[i] <= 0:
3455 continue
3456 vertical_still_left[i] -= 1
3457 while len(row) < i:
3458 row.append(InflCell("", False, 1, 1, None))
3459 row.append(col_gap_data[i])
3460 # print(" ROW {!r}".format(row))
3461 if row_has_nonempty_cells:
3462 rows.append(row)
3463 elif kind in (
3464 NodeKind.TABLE_HEADER_CELL,
3465 NodeKind.TABLE_CELL,
3466 "th",
3467 "td",
3468 "span",
3469 ):
3470 # print(" TOP-LEVEL CELL", node)
3471 pass
3473 if sub_ret:
3474 main_ret = sub_ret
3475 main_ret.append((rows, titles, after, depth))
3476 else:
3477 main_ret = [(rows, titles, after, depth)]
3478 return main_ret
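# Illustrative note (added for exposition): for a table whose cell
# contains one nested table, main_ret is roughly
#   [(rows_before_subtable, titles, after, 0),
#    (subtable_rows, subtable_titles, "", 1),
#    (rows_after_subtable, [], "", 0)]
# so the pieces come out in document order rather than recursion order.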
3480 new_rows = handle_table1(
3481 wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
3482 )
3484 # Now we have a table that has been parsed into rows and columns of
3485 # InflCell objects. Parse the inflection table from that format.
3486 if new_rows:
3487 for rows, titles, after, depth in new_rows:
3488 handle_generic_table(
3489 wxr,
3490 tablecontext,
3491 data,
3492 word,
3493 lang,
3494 pos,
3495 rows,
3496 titles,
3497 source,
3498 after,
3499 depth,
3500 )
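# Illustrative sketch (added for exposition; not part of inflection.py):
# the core of handle_table1()'s rowspan/colspan bookkeeping, reduced to
# plain values. Cells are (text, rowspan, colspan) tuples; the output is
# a rectangular grid of texts, mirroring how col_gap_data and
# vertical_still_left cooperate above.
def _rowspan_fill_sketch(raw_rows):
    col_gap_data = []  # saved cells for columns with rowspan > 1
    vertical_still_left = []  # remaining rows to fill for each column
    rows = []
    for raw_row in raw_rows:
        row = []
        for text, rowspan, colspan in raw_row:
            # Fill gaps left by rowspans from earlier rows.
            while (
                len(row) < len(vertical_still_left)
                and vertical_still_left[len(row)] > 0
            ):
                vertical_still_left[len(row)] -= 1
                row.append(col_gap_data[len(row)])
            for _ in range(colspan):
                if rowspan > 1:
                    while len(col_gap_data) <= len(row):
                        col_gap_data.append(None)
                        vertical_still_left.append(0)
                    col_gap_data[len(row)] = text
                    vertical_still_left[len(row)] = rowspan - 1
                row.append(text)
        # Fill any trailing columns still covered by a rowspan.
        for i in range(len(row), len(vertical_still_left)):
            if vertical_still_left[i] <= 0:
                continue
            vertical_still_left[i] -= 1
            while len(row) < i:
                row.append("")
            row.append(col_gap_data[i])
        rows.append(row)
    return rows

# For example, a 2x2 table whose first cell spans both rows:
#   _rowspan_fill_sketch([[("A", 2, 1), ("B", 1, 1)], [("C", 1, 1)]])
#   -> [['A', 'B'], ['A', 'C']]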
3503def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
3504 """If a wikitext table cell contains HTML cells `<td>`, as they sometimes
3505 do because it is easier to write wikitext conditionals that way,
3506 those td-elements are parsed as child elements of the Wikitext cell.
3507 This generator will yield wikitext and HTML direct children of
3508 `node` and if a Wikitext TABLE_CELL has direct td-element children,
3509 those are also yielded."""
3510 for col in node.children:
3511 if not isinstance(col, WikiNode):
3512 continue
3513 if any(
3514 isinstance(c, HTMLNode) and c.sarg in ("th", "td")
3515 for c in col.children
3516 ):
3517 html_cells = []
3518 content = []
3519 for c in col.children:
3520 if isinstance(c, HTMLNode) and c.sarg in ("th", "td"):
3521 html_cells.append(c)
3522 else:
3523 content.append(c)
3524 # Remove td-elements from col so they are not returned twice
3525 col.children = content
3526 yield col
3527 for c in html_cells:
3528 yield c
3529 else:
3530 yield col
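# For example (illustrative wikitext, not from any particular page):
#   |-
#   | foo<td>bar</td>
# parses as one wikitext TABLE_CELL ("foo") with an HTML <td> child
# ("bar"); get_table_cells() yields the wikitext cell (with the <td>
# removed from its children) and then the <td> cell, so "foo" and "bar"
# end up as separate columns.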
3533def handle_html_table(
3534 wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
3535):
3536 """A passer-on function for html-tables, XXX, remove these?"""
3537 handle_wikitext_or_html_table(
3538 wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
3539 )
3542def handle_wikitext_table(
3543 wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
3544):
3545 """A passer-on function for html-tables, XXX, remove these?"""
3546 handle_wikitext_or_html_table(
3547 wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
3548 )
3551def parse_inflection_section(
3552 wxr, data, word, lang, pos, section, tree, tablecontext=None
3553):
3554 """Parses an inflection section on a page. ``data`` should be the
3555 data for a part-of-speech, and inflections will be added to it."""
3557 # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
3558 # .format(word, lang, pos, section))
3559 assert isinstance(wxr, WiktextractContext)
3560 assert isinstance(data, dict)
3561 assert isinstance(word, str)
3562 assert isinstance(lang, str)
3563 assert isinstance(section, str)
3564 assert isinstance(tree, WikiNode)
3565 assert tablecontext is None or isinstance(tablecontext, TableContext)
3566 source = section
3567 tables = []
3568 titleparts = []
3569 preceding_bolded_title = ""
3571 # from wikitextprocessor.parser import print_tree
3572 # print_tree(tree)
3573 # print("--------------******************----------------")
3575 def process_tables():
3576 for kind, node, titles, after in tables:
3577 after = "".join(after).strip()
3578 after = clean_value(wxr, after)
3579 if kind == "wikitext":
3580 handle_wikitext_table(
3581 wxr,
3582 word,
3583 lang,
3584 pos,
3585 data,
3586 node,
3587 titles,
3588 source,
3589 after,
3590 tablecontext=tablecontext,
3591 )
3592 elif kind == "html": 3592 ↛ 3606line 3592 didn't jump to line 3606 because the condition on line 3592 was always true
3593 handle_html_table(
3594 wxr,
3595 word,
3596 lang,
3597 pos,
3598 data,
3599 node,
3600 titles,
3601 source,
3602 after,
3603 tablecontext=tablecontext,
3604 )
3605 else:
3606 raise RuntimeError(
3607 "{}: unimplemented table kind {}".format(word, kind)
3608 )
3610 def recurse_navframe(node, titles):
3611 nonlocal tables
3612 nonlocal titleparts
3613 titleparts = []
3614 old_tables = tables
3615 tables = []
3617 recurse(node, [], navframe=True)
3619 process_tables()
3620 tables = old_tables
3622 def recurse(node, titles, navframe=False):
3623 nonlocal tables
3624 if isinstance(node, (list, tuple)):
3625 for x in node:
3626 recurse(x, titles, navframe)
3627 return
3628 if isinstance(node, str):
3629 if tables:
3630 tables[-1][-1].append(node)
3631 elif navframe:
3632 titleparts.append(node)
3633 return
3634 if not isinstance(node, WikiNode):
3635 if navframe:
3636 wxr.wtp.debug(
3637 "inflection table: unhandled in NavFrame: {}".format(node),
3638 sortid="inflection/2907",
3639 )
3640 return
3641 kind = node.kind
3642 if navframe:
3643 if kind == NodeKind.HTML:
3644 classes = node.attrs.get("class", "").split()
3645 if "NavToggle" in classes: 3645 ↛ 3646line 3645 didn't jump to line 3646 because the condition on line 3645 was never true
3646 return
3647 if "NavHead" in classes:
3648 # print("NAVHEAD:", node)
3649 recurse(node.children, titles, navframe)
3650 return
3651 if "NavContent" in classes:
3652 # print("NAVCONTENT:", node)
3653 title = "".join(titleparts).strip()
3654 title = html.unescape(title)
3655 title = title.strip()
3656 new_titles = list(titles)
3657 if not re.match(r"(Note:|Notes:)", title):
3658 new_titles.append(title)
3659 recurse(node, new_titles, navframe=False)
3660 return
3661 else:
3662 if kind == NodeKind.TABLE:
3663 tables.append(["wikitext", node, titles, []])
3664 return
3665 elif kind == NodeKind.HTML and node.sarg == "table":
3666 classes = node.attrs.get("class", ())
3667 if "audiotable" in classes:
3668 return
3669 tables.append(["html", node, titles, []])
3670 return
3671 elif kind in (
3672 NodeKind.LEVEL2,
3673 NodeKind.LEVEL3,
3674 NodeKind.LEVEL4,
3675 NodeKind.LEVEL5,
3676 NodeKind.LEVEL6,
3677 ):
3678 return # Skip subsections
3679 if (
3680 kind == NodeKind.HTML
3681 and node.sarg == "div"
3682 and "NavFrame" in node.attrs.get("class", "").split()
3683 ):
3684 recurse_navframe(node, titles)
3685 return
3686 if kind == NodeKind.LINK:
3687 if len(node.largs) > 1:
3688 recurse(node.largs[1:], titles, navframe)
3689 else:
3690 recurse(node.largs[0], titles, navframe)
3691 return
3692 if kind == NodeKind.HTML and node.sarg == "ref":
3693 return
3694 if kind == NodeKind.LIST and node.sarg == ";":
3695 nonlocal preceding_bolded_title
3696 from wiktextract.page import clean_node
3698 preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
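# e.g. a definition-list line like "; Declension of foo" (a
# hypothetical example) above a table becomes the title attached
# to the tables that follow it.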
3699 for x in node.children:
3700 recurse(x, titles, navframe)
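# The NavFrame structure handled by recurse()/recurse_navframe() is,
# roughly (illustrative sketch of the expected HTML):
#   <div class="NavFrame">
#     <div class="NavHead">table title</div>
#     <div class="NavContent"> ...the actual table... </div>
#   </div>
# NavHead text is collected into titleparts and then attached as a
# title when descending into NavContent.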
3702 assert tree.kind == NodeKind.ROOT
3703 for x in tree.children:
3704 if preceding_bolded_title != "":
3705 recurse(x, [preceding_bolded_title])
3706 else:
3707 recurse(x, [])
3709 # Process the tables we found
3710 process_tables()
3712 # XXX this code is used for extracting tables for inflection tests
3713 if wxr.config.expand_tables:
3714 if section != "Mutation":
3715 with open(wxr.config.expand_tables, "w") as f:
3716 f.write(word + "\n")
3717 f.write(lang + "\n")
3718 f.write(pos + "\n")
3719 f.write(section + "\n")
3720 text = wxr.wtp.node_to_wikitext(tree)
3721 f.write(text + "\n")