Coverage for src/wiktextract/extractor/en/inflection.py: 87%
1521 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1# Code for parsing inflection tables.
2#
3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org.
5import collections
6import copy
7import functools
8import html
9import itertools
10import re
11import unicodedata
12from typing import Generator, Optional, Union
14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode
16from ...clean import clean_value
17from ...datautils import data_append, freeze, split_at_comma_semi
18from ...tags import valid_tags
19from ...wxr_context import WiktextractContext
20from .form_descriptions import (
21 classify_desc,
22 decode_tags,
23 distw,
24 parse_head_final_tags,
25)
26from .inflection_kludges import ka_decl_noun_template_cell
27from .inflectiondata import infl_map, infl_start_map, infl_start_re
28from .lang_specific_configs import get_lang_conf, lang_specific_tags
29from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS
30from .type_utils import FormData
# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
debug_cell_text: Optional[str] = None


def set_debug_cell_text(text: str) -> None:
    """Set the global cell text that triggers debug printouts while
    parsing inflection tables (see --debug-text-cell)."""
    global debug_cell_text
    debug_cell_text = text
# Type alias: a list of tag tuples, where each tuple is one alternative
# interpretation (a sorted collection of tags).
TagSets = list[tuple[str, ...]]

# Column texts that are interpreted as an empty column.
# Most of the one-character entries are Unicode dash/hyphen/minus variants
# that visually resemble "-"; they are listed individually because they are
# distinct codepoints.
IGNORED_COLVALUES = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "-",
    "/",
    "?",
    "not used",
    "not applicable",
}
# These tags are never inherited from above
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags.  Person, number
# and definiteness come first, then the Bantu-style noun classes 1-18
# (generated programmatically since they follow a fixed pattern), then the
# genders.
object_concord_replacements = {
    "first-person": "object-first-person",
    "second-person": "object-second-person",
    "third-person": "object-third-person",
    "singular": "object-singular",
    "plural": "object-plural",
    "definite": "object-definite",
    "indefinite": "object-indefinite",
}
object_concord_replacements.update(
    ("class-{}".format(i), "object-class-{}".format(i)) for i in range(1, 19)
)
object_concord_replacements.update(
    masculine="object-masculine",
    feminine="object-feminine",
)
# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: every space-separated tag in the map values must
# be a known tag in valid_tags.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# Matches header text like "Conjugation of <word>" that should be ignored by
# the title word maps (the word itself would otherwise trigger false matches).
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)
# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
    "contracted": "contracted",
    "present": "present",
    "perfect": "perfect",
    "imperfect": "imperfect",
    "pluperfect": "pluperfect",
    "future": "future",
    "aorist": "aorist",
}
# Import-time sanity check: every space-separated tag in the map values must
# be a known tag in valid_tags.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )

# Case-insensitive pattern matching any of the keys above (or the ignorable
# "Conjugation of ..." header prefix, which is filtered out by the caller).
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)
# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "Attic": "Attic",  # e.g. καλός/Greek/Adj
    "Epic": "Epic",  # e.g. καλός/Greek/Adj
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# Parenthesized element starts to map them to tags for form for the rest of
# the element
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# Matches a title element beginning with one of the title_elemstart_map keys
# followed by a space.
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)
# Regexp for cell starts that are likely definitions of reference symbols.
# Matches asterisk runs, daggers/triangles, digit and superscript-digit
# sequences, and superscript letters.  See also nondef_re.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"

# Certain tags are moved from headers in tables into word tags, as they always
# apply to the whole word.
TAGS_FORCED_WORDTAGS: set[str] = set(
    [
        # This was originally created for an issue with number paradigms in
        # Arabic, but that is being handled elsewhere now.
    ]
)
class InflCell:
    """Cell in an inflection table."""

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        """Initialize a cell.

        ``text`` is the raw cell text (stored stripped); ``is_title`` marks
        header cells; ``colspan``/``rowspan`` are HTML-style spans (>= 1);
        ``target`` is an optional string associated with the cell."""
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        self.text = text.strip()
        # Fix: decide the title flag from the *stripped* text and store a
        # real bool.  The old expression ``text and is_title`` stored the
        # empty string for empty cells and counted whitespace-only cells
        # as titles even though self.text was empty.
        self.is_title = bool(self.text) and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        v = "{}/{}/{}/{!r}".format(
            self.text, self.is_title, self.colspan, self.rowspan
        )
        if self.target:
            v += ": {!r}".format(self.target)
        return v

    def __repr__(self) -> str:
        return str(self)
class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: TagSets,
        text: str,
        all_headers_row: bool,
    ) -> None:
        # Validate the structural assumptions made by the table parser.
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        assert all(isinstance(ts, tuple) for ts in tagsets)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Normalize each alternative: drop duplicate tags and store the
        # result as a sorted tuple so tagsets compare predictably.
        self.tagsets = [tuple(sorted(set(ts))) for ts in tagsets]
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False
def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character.

    A character counts as a superscript when its Unicode name starts with
    "SUPERSCRIPT", "MODIFIER LETTER SMALL" or "MODIFIER LETTER CAPITAL".
    Characters without a Unicode name return False."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        name = unicodedata.name(ch)
    except ValueError:
        # Unassigned / unnamed codepoints are not superscripts.
        return False
    # str.startswith with a tuple of prefixes replaces the old per-call
    # regex alternation (re.match anchors at the start, so the semantics
    # are identical).
    return name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )
def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no purpose
    together (cover all options).  Mutates ``tags`` in place."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Mutually-exclusive pairs: when both members are present and the
    # language config says the pair carries no information, drop both.
    if (
        "animate" in tags
        and "inanimate" in tags
        and get_lang_conf(lang, "animate_inanimate_remove")
    ):
        tags.remove("animate")
        tags.remove("inanimate")
    if (
        "virile" in tags
        and "nonvirile" in tags
        and get_lang_conf(lang, "virile_nonvirile_remove")
    ):
        tags.remove("virile")
        tags.remove("nonvirile")
    # If all values of a grammatical category in the language are listed,
    # they cover every option and carry no information, so remove them all.
    # (Previously six copy-pasted stanzas; collapsed into one loop.)
    for conf_key in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        values = get_lang_conf(lang, conf_key)
        if values and all(x in tags for x in values):
            for x in values:
                tags.remove(x)
def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    return {valid_tags[tag] for alternative in tagset for tag in alternative}
def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # Fix: this line previously re-validated ``tagsets1`` (copy-paste
    # error), leaving ``tagsets2`` contents unchecked.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Merge ``tags1`` into ``tagsets``: if some existing alternative
        # differs from it in at most one tag category, replace that
        # alternative with the union (recursively, since the union may
        # merge further); otherwise append ``tags1`` as a new alternative.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            # print("tags1={} tags2={} num_differ={}"
            #       .format(tags1, tags2, num_differ))
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        tagsets.append(())

    # print("or_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, tagsets))
    return tagsets
def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # Fix: this line previously re-validated ``tagsets1`` (copy-paste
    # error), leaving ``tagsets2`` contents unchecked.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            # Internal marker tag; never emitted in combined tagsets.
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    # print("and_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, new_tagsets))
    return new_tagsets
@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags): ``cleaned`` is the cell text with reference
    markers stripped, ``refs`` is the list of reference symbols attached to
    the cell (NFKD-normalized), ``defs`` is a list of (symbol, definition)
    pairs when the cell itself defines reference symbols, and ``tags`` is a
    list of tags implied by special reference markers."""
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags = []
    # Normalize: drop trailing comma/bullet and collapse whitespace.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Free-text notes ("See ...", "Note: ...", etc.) carry no form data;
    # signal the caller to skip them.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    # Peel "^x" / "^(x,y)" markers off the end, one at a time.
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # The cell defines reference symbols: split it into
        # (symbol, definition-text) pairs.
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                # Superscript "rare" marker becomes a tag, not a ref.
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition ("1) text", "2 text", ...)
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    #       .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags
@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is dictionary describing additional forms to be
    included in the part-of-speech entry)."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Strip HTML entities/tags and collapse whitespace before matching.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags = []
    table_tags = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip matches that are just the "Conjugation of ..." header prefix.
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                # "class foo", "type foo", etc. become class forms.
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms
777def expand_header(
778 wxr: WiktextractContext,
779 tablecontext: "TableContext",
780 word: str,
781 lang: str,
782 pos: str,
783 text: str,
784 base_tags: Union[list[str], set[str], tuple[str, ...]],
785 silent=False,
786 ignore_tags=False,
787 depth=0,
788 column_number: int | None = None,
789) -> list[tuple[str, ...]]:
790 """Expands a cell header to tagset, handling conditional expressions
791 in infl_map. This returns list of tuples of tags, each list element
792 describing an alternative interpretation. ``base_tags`` is combined
793 column and row tags for the cell in which the text is being interpreted
794 (conditional expressions in inflection data may depend on it).
795 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags``
796 is True, then tags listed in "if" will be ignored in the test (this is
797 used when trying to heuristically detect whether a non-<th> cell is anyway
798 a header)."""
799 assert isinstance(wxr, WiktextractContext)
800 assert isinstance(word, str)
801 assert isinstance(lang, str)
802 assert isinstance(pos, str)
803 assert isinstance(text, str)
804 assert isinstance(base_tags, (list, tuple, set))
805 assert silent in (True, False)
806 assert isinstance(depth, int)
807 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags))
808 # First map the text using the inflection map
809 text = clean_value(wxr, text)
810 combined_return: list[tuple[str, ...]] = []
811 parts = split_at_comma_semi(text, separators=[";"])
812 for text in parts:
813 if not text: 813 ↛ 814line 813 didn't jump to line 814 because the condition on line 813 was never true
814 continue
815 if text in infl_map:
816 v = infl_map[text] # list or string
817 else:
818 m = re.match(infl_start_re, text)
819 if m is not None: 819 ↛ 820line 819 didn't jump to line 820 because the condition on line 819 was never true
820 v = infl_start_map[m.group(1)]
821 # print("INFL_START {} -> {}".format(text, v))
822 elif re.match(r"Notes", text):
823 # Ignored header
824 # print("IGNORING NOTES")
825 combined_return = or_tagsets(
826 lang, pos, combined_return, [("dummy-skip-this",)]
827 )
828 # this just adds dummy-skip-this
829 continue
830 elif text in IGNORED_COLVALUES:
831 combined_return = or_tagsets(
832 lang, pos, combined_return, [("dummy-ignore-skipped",)]
833 )
834 continue
835 # Try without final parenthesized part
836 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text)
837 if text_without_parens in infl_map:
838 v = infl_map[text_without_parens]
839 elif m is None: 839 ↛ 855line 839 didn't jump to line 855 because the condition on line 839 was always true
840 if not silent:
841 wxr.wtp.debug(
842 "inflection table: unrecognized header: {}".format(
843 repr(text)
844 ),
845 sortid="inflection/735",
846 )
847 # Unrecognized header
848 combined_return = or_tagsets(
849 lang, pos, combined_return, [("error-unrecognized-form",)]
850 )
851 continue
853 # Then loop interpreting the value, until the value is a simple string.
854 # This may evaluate nested conditional expressions.
855 default_else = None
856 while True:
857 # If it is a string, we are done.
858 if isinstance(v, str):
859 tags = set(v.split())
860 remove_useless_tags(lang, pos, tags)
861 tagset = [tuple(sorted(tags))]
862 break
863 # For a list, just interpret it as alternatives. (Currently the
864 # alternatives must directly be strings.)
865 if isinstance(v, (list, tuple)):
866 tagset = []
867 for x in v:
868 tags = set(x.split())
869 remove_useless_tags(lang, pos, tags)
870 tags_t = tuple(sorted(tags))
871 if tags_t not in tagset: 871 ↛ 867line 871 didn't jump to line 867 because the condition on line 871 was always true
872 tagset.append(tags_t)
873 break
874 # Otherwise the value should be a dictionary describing a
875 # conditional expression.
876 if not isinstance(v, dict): 876 ↛ 877line 876 didn't jump to line 877 because the condition on line 876 was never true
877 wxr.wtp.debug(
878 "inflection table: internal: "
879 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]),
880 sortid="inflection/767",
881 )
882 tagset = [()]
883 break
884 # Evaluate the conditional expression.
885 assert isinstance(v, dict)
886 cond: Union[bool, str] = "default-true"
887 c: Union[str, list[str], set[str]] = ""
888 # Handle "lang" condition. The value must be either a
889 # single language or a list of languages, and the
890 # condition evaluates to True if the table is one of
891 # those languages.
892 if "lang" in v:
893 c = v["lang"]
894 if isinstance(c, str):
895 cond = c == lang
896 else:
897 assert isinstance(c, (list, tuple, set))
898 cond = lang in c
899 # Handle "nested-table-depth" condition. The value must
900 # be an int or list of ints, and the condition evaluates
901 # True if the depth is one of those values.
902 # "depth" is how deep into a nested table tree the current
903 # table lies. It is first started in handle_wikitext_table,
904 # so only applies to tables-within-tables, not other
905 # WikiNode content. `depth` is currently only passed as a
906 # parameter down the table parsing stack, and not stored.
907 if cond and "nested-table-depth" in v: 907 ↛ 908line 907 didn't jump to line 908 because the condition on line 907 was never true
908 d = v["nested-table-depth"]
909 if isinstance(d, int):
910 cond = d == depth
911 else:
912 assert isinstance(d, (list, tuple, set))
913 cond = depth in d
914 # Column index: check if we're in position X of the row
915 if cond and "column-index" in v:
916 index = v["column-index"]
917 if isinstance(index, int): 917 ↛ 920line 917 didn't jump to line 920 because the condition on line 917 was always true
918 cond = index == column_number
919 else:
920 assert isinstance(index, (list, tuple, set))
921 cond = column_number in index
922 # Handle inflection-template condition. Must be a string
923 # or list of strings, and if tablecontext.template_name is in
924 # those, accept the condition.
925 # TableContext.template_name is passed down from page/
926 # parse_inflection, before parsing and expanding itself
927 # has begun.
928 if cond and tablecontext and "inflection-template" in v:
929 d1 = v["inflection-template"]
930 if isinstance(d1, str): 930 ↛ 933line 930 didn't jump to line 933 because the condition on line 930 was always true
931 cond = d1 == tablecontext.template_name
932 else:
933 assert isinstance(d1, (list, tuple, set))
934 cond = tablecontext.template_name in d1
935 # Handle "pos" condition. The value must be either a single
936 # part-of-speech or a list of them, and the condition evaluates to
937 # True if the part-of-speech is any of those listed.
938 if cond and "pos" in v:
939 c = v["pos"]
940 if isinstance(c, str):
941 cond = c == pos
942 else:
943 assert isinstance(c, (list, tuple, set))
944 cond = pos in c
945 # Handle "if" condition. The value must be a string containing a
946 # space-separated list of tags. The condition evaluates to True if
947 # ``base_tags`` contains all of the listed tags. If the condition
948 # is of the form "any: ...tags...", then any of the tags will be
949 # enough.
950 if cond and "if" in v and not ignore_tags:
951 c = v["if"]
952 assert isinstance(c, str)
953 # "if" condition is true if any of the listed tags is present if
954 # it starts with "any:", otherwise all must be present
955 if c.startswith("any: "):
956 cond = any(t in base_tags for t in c[5:].split())
957 else:
958 cond = all(t in base_tags for t in c.split())
960 # Handle "default" assignment. Store the value to be used
961 # as a default later.
962 if "default" in v:
963 assert isinstance(v["default"], str)
964 default_else = v["default"]
966 # Warning message about missing conditions for debugging.
968 if cond == "default-true" and not default_else and not silent:
969 wxr.wtp.debug(
970 "inflection table: IF MISSING COND: word={} "
971 "lang={} text={} base_tags={} c={} cond={}".format(
972 word, lang, text, base_tags, c, cond
973 ),
974 sortid="inflection/851",
975 )
976 # Based on the result of evaluating the condition, select either
977 # "then" part or "else" part.
978 if cond:
979 v = v.get("then", "")
980 else:
981 v1 = v.get("else")
982 if v1 is None:
983 if default_else:
984 v = default_else
985 else:
986 if not silent:
987 wxr.wtp.debug(
988 "inflection table: IF WITHOUT ELSE EVALS "
989 "False: "
990 "{}/{} {!r} base_tags={}".format(
991 word, lang, text, base_tags
992 ),
993 sortid="inflection/865",
994 )
995 v = "error-unrecognized-form"
996 else:
997 v = v1
999 # Merge the resulting tagset from this header part with the other
1000 # tagsets from the whole header
1001 combined_return = or_tagsets(lang, pos, combined_return, tagset)
1003 # Return the combined tagsets, or empty tagset if we got no tagsets
1004 if not combined_return:
1005 combined_return = [()]
1006 return combined_return
def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list["HdrSpan"],
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Walks the accumulated header spans from the bottom of the table upward,
    keeping only those that horizontally overlap the cell at ``start`` ..
    ``start + colspan``, and merges their tagsets row by row (or_tagsets
    within a row, and_tagsets between rows).

    Args:
        lang: language name; used to look up language-specific behavior
            via get_lang_conf.
        pos: part-of-speech of the word being parsed.
        hdrspans: previously collected HdrSpan objects (each has .start,
            .colspan, .rowspan, .rownum, .tagsets, .expanded,
            .all_headers_row attributes).
        start: first column index of the cell.
        colspan: width of the cell in columns.
        celltext: the cell's text; used only for debug printing against
            the global ``debug_cell_text``.

    Returns:
        A non-empty list of tag tuples (alternative tagsets) applying to
        the cell; ``[()]`` when no headers contribute tags.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # For debugging, set debug_cell_text to the form for whose cell you
    # want debug prints.
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                "  row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    # (start, colspan) keys of header cells already consumed; used to block
    # re-use of the same column position higher up in the table.
    used = set()
    coltags = [()]
    last_header_row = 1000000  # sentinel: no header row accepted yet
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000  # sentinel: no row merged yet
    used_hdrspans = set()  # id()s of HdrSpan objects already merged
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Partial overlap at the right edge also aborts, unless the header
        # was artificially widened (expanded) to cover the row.
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present.
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            # in_cats: tag categories used by same-row headers fully inside
            # this cell's horizontal span.
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets. This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row. Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside
                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above. However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized to the 1000000 sentinel
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            # A fresh condition chain (note: plain ``if``, not ``elif``) for
            # skip-vs-stop decisions on category collisions.
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags
1391def parse_simple_table(
1392 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
1393):
1394 """This is the default table parser. Despite its name, it can parse
1395 complex tables. This returns a list of forms to be added to the
1396 part-of-speech, or None if the table could not be parsed."""
1397 assert isinstance(wxr, WiktextractContext)
1398 assert isinstance(tablecontext, TableContext)
1399 assert isinstance(word, str)
1400 assert isinstance(lang, str)
1401 assert isinstance(pos, str)
1402 assert isinstance(rows, list)
1403 assert isinstance(source, str)
1404 assert isinstance(after, str)
1405 assert isinstance(depth, int)
1406 for row in rows:
1407 for col in row:
1408 assert isinstance(col, InflCell)
1409 assert isinstance(titles, list)
1410 for x in titles:
1411 assert isinstance(x, str)
1413 # print("PARSE_SIMPLE_TABLE: TITLES:", titles)
1414 if debug_cell_text: 1414 ↛ 1415line 1414 didn't jump to line 1415 because the condition on line 1414 was never true
1415 print("ROWS:")
1416 for row in rows:
1417 print(" ", row)
1419 # Check for forced rowspan kludge. See e.g.
1420 # maorski/Serbo-Croatian. These are essentially multi-row
1421 # cells implemented using <br> rather than separate cell. We fix this
1422 # by identifying rows where this happens, and splitting the current row
1423 # to multiple rows by synthesizing additional cells.
1424 new_rows = []
1425 for row in rows:
1426 split_row = (
1427 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row)
1428 and
1429 # x is an InflCell
1430 all(x.rowspan == 1 for x in row)
1431 )
1432 if not split_row:
1433 new_rows.append(row)
1434 continue
1435 row1 = []
1436 row2 = []
1437 for cell in row:
1438 cell1 = copy.deepcopy(cell)
1439 if "\n" in cell.text:
1440 # Has more than one line - split this cell
1441 parts = cell.text.strip().splitlines()
1442 if len(parts) != 2: 1442 ↛ 1443line 1442 didn't jump to line 1443 because the condition on line 1442 was never true
1443 wxr.wtp.debug(
1444 "forced rowspan kludge got {} parts: {!r}".format(
1445 len(parts), cell.text
1446 ),
1447 sortid="inflection/1234",
1448 )
1449 cell2 = copy.deepcopy(cell)
1450 cell1.text = parts[0]
1451 cell2.text = parts[1]
1452 else:
1453 cell1.rowspan = 2
1454 cell2 = cell1 # ref, not a copy
1455 row1.append(cell1)
1456 row2.append(cell2)
1457 new_rows.append(row1)
1458 new_rows.append(row2)
1459 rows = new_rows
1460 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:")
1461 # for row in rows:
1462 # print(" ", row)
1464 # Parse definitions for references (from table itself and from text
1465 # after it)
1466 def_ht = {}
1468 def add_defs(defs: list[tuple[str, str]]) -> None:
1469 for ref, d in defs:
1470 # print("DEF: ref={} d={}".format(ref, d))
1471 d = d.strip()
1472 d = d.split(". ")[0].strip() # text before ". "
1473 if not d: 1473 ↛ 1474line 1473 didn't jump to line 1474 because the condition on line 1473 was never true
1474 continue
1475 if d.endswith("."): # catc ".."??
1476 d = d[:-1]
1477 tags, topics = decode_tags(d, no_unknown_starts=True)
1478 # print(f"{ref=}, {transformed=}, {tags=}")
1479 if topics or any("error-unknown-tag" in ts for ts in tags):
1480 d = d[0].lower() + d[1:]
1481 tags, topics = decode_tags(
1482 d, no_unknown_starts=True
1483 )
1484 if topics or any("error-unknown-tag" in ts for ts in tags):
1485 # Failed to parse as tags
1486 # print("Failed: topics={} tags={}"
1487 # .format(topics, tags))
1488 continue
1489 tags1_s: set[str] = set()
1490 for ts in tags:
1491 # Set.update is a union operation: definition tags are flat
1492 tags1_s.update(ts)
1493 tags1 = tuple(sorted(tags1_s))
1494 # print("DEFINED: {} -> {}".format(ref, tags1))
1495 def_ht[ref] = tags1
    def generate_tags(
        rowtags: list[tuple[str, ...]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        """Combine row tags, computed column tags, and the expanded header
        text of the current cell into new tag alternatives.

        Closure variables used (from parse_simple_table's cell loop):
        lang, pos, hdrspans, col_idx, colspan, col, text, global_tags,
        refs_tags, hdr_tags, depth, wxr, tablecontext, word.

        NOTE: ``table_tags`` is mutated in place (forced word-level tags
        are appended to it) in addition to being read.

        Returns (new_rowtags, new_coltags, all_hdr_tags) where each is a
        deduplicated list of sorted tag tuples.
        """
        new_coltags = []
        all_hdr_tags = []  # list of tuples
        new_rowtags = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                # Union of row tags, column tags, global tags and the
                # (flattened) table-level tags.
                base_tags: set[str] = (
                    set(rt0)
                    | set(ct0)
                    | set(global_tags)
                    | set(itertools.chain.from_iterable(table_tags))
                )  # Union.
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                    column_number=col_idx,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Certain tags are always moved to word-level tags
                    if tt_s & TAGS_FORCED_WORDTAGS:
                        table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS)
                        tt_s = tt_s - TAGS_FORCED_WORDTAGS
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags
    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        """Create a HdrSpan for the current header cell and maintain the
        left-side "col0" header expansion state.

        Closure variables used (from parse_simple_table's cell loop):
        col_idx, colspan, rowspan, rownum, new_coltags, all_headers,
        all_hdr_tags, previously_seen, lang, tablecontext.

        Returns the (possibly unchanged) ``col`` text plus the updated
        (col0_followed_by_nonempty, col0_hdrspan) state.
        """
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags). In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            # No left-side header candidate yet: this becomes it.
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            #   - col0_hdrspan set, continue with allowed current
            #   - col0_hdrspan set, expand, start new
            #   - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan. Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan:
                # *Expand* current col0_hdrspan to reach this column.
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan
1657 def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
1658 # Split the cell text into alternatives
1659 split_extra_tags = []
1660 if col and is_superscript(col[0]): 1660 ↛ 1661line 1660 didn't jump to line 1661 because the condition on line 1660 was never true
1661 alts = [col]
1662 else:
1663 separators = [";", "•", r"\n", " or "]
1664 if " + " not in col:
1665 separators.append(",")
1666 if not col.endswith("/"):
1667 separators.append("/")
1668 if col in special_phrase_splits:
1669 # Use language-specific special splits.
1670 # These are phrases and constructions that have
1671 # unique ways of splitting, not specific characters
1672 # to split on like with the default splitting.
1673 alts, tags = special_phrase_splits[col]
1674 split_extra_tags = tags.split()
1675 for x in split_extra_tags:
1676 assert x in valid_tags
1677 assert isinstance(alts, (list, tuple))
1678 assert isinstance(tags, str)
1679 else:
1680 # Use default splitting. However, recognize
1681 # language-specific replacements and change them to magic
1682 # characters before splitting. This way we won't split
1683 # them. This is important for, e.g., recognizing
1684 # alternative pronouns.
1685 # The magic characters are characters out of Unicode scope
1686 # that are given a simple incremental value, int > unicode.
1687 repls = {}
1688 magic_ch = MAGIC_FIRST
1689 trs = get_lang_conf(lang, "form_transformations")
1690 # trs is a list of lists of strings
1691 for _, v, _, _ in trs:
1692 # v is a pattern string, like "^ich"
1693 # form_transformations data is doing double-duty here,
1694 # because the pattern strings are already known to us and
1695 # not meant to be split.
1696 m = re.search(v, col)
1697 if m is not None:
1698 # if pattern found in text
1699 magic = chr(magic_ch)
1700 magic_ch += 1 # next magic character value
1701 col = re.sub(v, magic, col) # replace with magic ch
1702 repls[magic] = m.group(0)
1703 # remember what regex match string each magic char
1704 # replaces. .group(0) is the whole match.
1705 alts0 = split_at_comma_semi(col, separators=separators)
1706 # with magic characters in place, split the text so that
1707 # pre-transformation text is out of the way.
1708 alts = []
1709 for alt in alts0:
1710 # create a new list with the separated items and
1711 # the magic characters replaced with the original texts.
1712 for k, v in repls.items():
1713 alt = re.sub(k, v, alt)
1714 alts.append(alt)
1716 # Remove "*" from beginning of forms, as in non-attested
1717 # or reconstructed forms. Otherwise it might confuse romanization
1718 # detection.
1719 alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts)
1720 alts = list(
1721 x for x in alts if not re.match(r"pronounced with |\(with ", x)
1722 )
1723 alts = list(
1724 re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts
1725 )
1726 return col, alts, split_extra_tags
def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
    """Disentangle cells that mix forms with romanizations or IPA.

    Takes the list of alternatives split from one cell and returns a
    list of ``(form, romanization, ipa)`` triples; an empty string marks
    a missing romanization/ipa component.  The ``elif`` cascade below is
    order-sensitive: each branch recognizes one known layout of the cell
    (IPA under forms, trailing IPA, alternating romanizations, ...).
    NOTE: this is a closure; it reads ``tablecontext`` from the
    enclosing function for the Georgian template kludge.
    """
    # Handle the special case where romanization is given under
    # normal form, e.g. in Russian. There can be multiple
    # comma-separated forms in each case. We also handle the case
    # where instead of romanization we have IPA pronunciation
    # (e.g., avoir/French/verb).
    len2 = len(alts) // 2
    # Check for IPAs (forms first, IPAs under)
    # base, base, IPA, IPA
    if (
        len(alts) % 2 == 0  # Divisibly by two
        and all(
            re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
            for x in alts[len2:]
        )
    ):  # In the second half of alts
        nalts = list(
            (alts[i], "", alts[i + len2])
            # List of tuples: (base, "", ipa)
            for i in range(len2)
        )
    # base, base, base, IPA
    elif (
        len(alts) > 2
        and re.match(r"^\s*/.*/\s*$", alts[-1])
        and all(not x.startswith("/") for x in alts[:-1])
    ):
        # Only if the last alt is IPA
        nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
    # base, IPA, IPA, IPA
    elif (
        len(alts) > 2
        and not alts[0].startswith("/")
        and all(
            re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
        )
    ):
        # First is base and the rest is IPA alternatives
        nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

    # Check for romanizations, forms first, romanizations under
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    # Remove ends of strings starting from ^.
                    # Supescripts have been already removed
                    # from the string, while ^xyz needs to be
                    # removed separately, though it's usually
                    # something with a single letter?
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            == "other"
            for x in alts[:len2]
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for x in alts[len2:]
        )
    ):
        nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
    # Check for romanizations, forms and romanizations alternating
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            == "other"
            for i in range(0, len(alts), 2)
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for i in range(1, len(alts), 2)
        )
    ):
        # odds
        nalts = list(
            (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
        )
        # evens
    # Handle complex Georgian entries with alternative forms and*
    # *romanizations. It's a bit of a mess. Remove this kludge if not
    # needed anymore. NOTE THAT THE PARENTHESES ON THE WEBSITE ARE NOT
    # DISPLAYED. They are put inside their own span elements that are
    # then hidden with some CSS.
    # https://en.wiktionary.org/wiki/%E1%83%90%E1%83%9B%E1%83%94%E1%83%A0%E1%83%98%E1%83%99%E1%83%98%E1%83%A1_%E1%83%A8%E1%83%94%E1%83%94%E1%83%A0%E1%83%97%E1%83%94%E1%83%91%E1%83%A3%E1%83%9A%E1%83%98_%E1%83%A8%E1%83%A2%E1%83%90%E1%83%A2%E1%83%94%E1%83%91%E1%83%98
    # ამერიკის შეერთებულ შტატებს(ა) (ameriḳis šeertebul šṭaṭebs(a))
    # The above should generate two alts entries, with two different
    # parallel versions, one without (a) and with (a) at the end,
    # for both the Georgian original and the romanization.
    elif (
        tablecontext.template_name == "ka-decl-noun"
        and len(alts) == 1
        and " (" in alts[0]
    ):
        nalts = ka_decl_noun_template_cell(alts)
    else:
        # Default: no romanization/IPA layout recognized; expand
        # optional parenthesized infixes like "kind(er)" into parallel
        # full forms instead.
        new_alts = []
        for alt in alts:
            lst = [""]
            idx = 0
            for m in re.finditer(
                r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                # start OR letter OR asterisk (word/word*)
                # \\___________group 1_______/ \  \_g3_///
                #                               \__gr. 2_//
                # \_____________group 0________________/
                alt,
            ):
                v = m.group(2)  # (word/word/word...)
                if (
                    classify_desc(v) == "tags"  # Tags inside parens
                    or m.group(0) == alt
                ):  # All in parens
                    continue
                new_lst = []
                for x in lst:
                    x += alt[idx : m.start()] + m.group(1)
                    # alt until letter or asterisk
                    idx = m.end()
                    vparts = v.split("/")
                    # group(2) = ["word", "wörd"...]
                    if len(vparts) == 1:
                        new_lst.append(x)
                        new_lst.append(x + v)
                        # "kind(er)" -> ["kind", "kinder"]
                    else:
                        for vv in vparts:
                            new_lst.append(x + vv)
                            # "lampai(tten/den)" ->
                            # ["lampaitten", "lampaiden"]
                lst = new_lst
            for x in lst:
                new_alts.append(x + alt[idx:])
                # add the end of alt
        nalts = list((x, "", "") for x in new_alts)
        # [form, no romz, no ipa]
    return nalts
def find_semantic_parens(form: str) -> tuple[str, list[str]]:
    """Strip semantic brackets from *form* and return matching tags.

    "Some languages" (=Greek) wrap a whole form in brackets to mark
    properties that require tags: (...) = informal, [...] = rare,
    {...} = archaic, and {[...]} = rare + archaic (cf. είμαι/Greek/Verb).
    The wrapper is always removed; the corresponding tags are added only
    when the language configuration enables that interpretation.
    """
    extra_tags: list[str] = []
    # Each rule: (full-match pattern, chars to strip from each side,
    # language-config keys that must all be truthy, tags to emit).
    # Order matters: the {[...]}  pattern must be tried before {...}.
    bracket_rules = (
        (
            r"\([^][(){}]*\)$",
            1,
            ("parentheses_for_informal",),
            ["informal"],
        ),
        (
            r"\{\[[^][(){}]*\]\}$",
            2,
            ("square_brackets_for_rare", "curly_brackets_for_archaic"),
            ["rare", "archaic"],
        ),
        (
            r"\{[^][(){}]*\}$",
            1,
            ("curly_brackets_for_archaic",),
            ["archaic"],
        ),
        (
            r"\[[^][(){}]*\]$",
            1,
            ("square_brackets_for_rare",),
            ["rare"],
        ),
    )
    for pattern, strip, conf_keys, tags in bracket_rules:
        if re.match(pattern, form):
            # The wrapper is stripped whether or not the language
            # assigns it a meaning.
            form = form[strip:-strip]
            if all(get_lang_conf(lang, key) for key in conf_keys):
                extra_tags.extend(tags)
            break
    return form, extra_tags
def handle_parens(
    form: str, roman: str, clitic: str, extra_tags: list[str]
) -> tuple[str, str, str]:
    """Interpret a parenthesized piece of a form cell.

    NOTE: this closure reads ``paren`` (the text inside the
    parentheses), ``m`` (the regex match that located it in ``form``)
    and ``subst`` (the whitespace to substitute for the match) from the
    enclosing scope; it must be called right after those are set up.
    Depending on what ``paren`` looks like it is treated as a clitic,
    as inflection tags (appended to ``extra_tags`` in place), or as a
    romanization; in each case the parenthesized part is removed from
    ``form``.  Returns the possibly updated (form, roman, clitic).
    """
    if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
        # is there a clitic starting with apostrophe?
        clitic = paren
        # assume the whole paren is a clitic
        # then remove paren from form
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif classify_desc(paren) == "tags":
        tagsets1, topics1 = decode_tags(paren)
        if not topics1:
            for ts in tagsets1:
                ts = tuple(x for x in ts if " " not in x)
                # There are some generated tags containing
                # spaces; do not let them through here.
                extra_tags.extend(ts)
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
    # brackets contain romanization
    elif (
        m.start() > 0
        and not roman
        and classify_desc(form[: m.start()]) == "other"
        and
        # "other" ~ text
        classify_desc(paren) in ("romanization", "english")
        and not re.search(r"^with |-form$", paren)
    ):
        roman = paren
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif re.search(r"^with |-form", paren):
        # "with ..." / "...-form" annotations are dropped from the form
        # without generating tags or romanization.
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    return form, roman, clitic
def merge_row_and_column_tags(
    form: str, some_has_covered_text: bool
) -> tuple[list[FormData], str, bool]:
    """Combine row tags and column tags into final form entries.

    For each (rowtag set, coltag set) combination, builds the merged
    tag set, applies a long series of language- and POS-specific
    cleanups, and emits one form dict (plus a clitic dict when a
    separate clitic was extracted).  NOTE: this closure reads many
    variables from the enclosing scope: ``rowtags``, ``coltags``,
    ``global_tags``, ``extra_tags``, ``refs_tags``, ``tablecontext``,
    ``col_idx``, ``has_covering_hdr``, ``roman``, ``ipa``, ``clitic``,
    ``object_concord_replacements``, ``source``, ``lang``, ``pos``.
    Returns (entries, possibly modified form, some_has_covered_text).
    """
    # Merge column tags and row tags. We give preference
    # to moods etc coming from rowtags (cf. austteigen/German/Verb
    # imperative forms).

    # In certain cases, what a tag means depends on whether
    # it is a row or column header. Depending on the language,
    # we replace certain tags with others if they're in
    # a column or row

    ret = []
    # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
    # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
    for rt in sorted(rowtags):
        if "dummy-use-as-coltags" in rt:
            continue
        # if lang was in rowtag_replacements)
        # if not rtagreplacs == None:
        #     rt = replace_directional_tags(rt, rtagreplacs)
        for ct in sorted(coltags):
            if "dummy-use-as-rowtags" in ct:
                continue
            # if lang was in coltag_replacements
            # if not ctagreplacs == None:
            #     ct = replace_directional_tags(ct,
            #                                   ctagreplacs)
            tags = set(global_tags)
            tags.update(extra_tags)
            tags.update(rt)
            tags.update(refs_tags)
            tags.update(tablecontext.section_header)
            # Merge tags from column. For certain kinds of tags,
            # those coming from row take precedence.
            old_tags = set(tags)
            for t in ct:
                c = valid_tags[t]
                if c in ("mood", "case", "number") and any(
                    valid_tags[tt] == c for tt in old_tags
                ):
                    continue
                tags.add(t)

            # Extract language-specific tags from the
            # form. This may also adjust the form.
            form, lang_tags = lang_specific_tags(lang, pos, form)
            tags.update(lang_tags)

            # For non-finite verb forms, see if they have
            # a gender/class suffix
            if pos == "verb" and any(
                valid_tags[t] == "non-finite" for t in tags
            ):
                form, tt = parse_head_final_tags(wxr, lang, form)
                tags.update(tt)

            # Remove "personal" tag if have nth person; these
            # come up with e.g. reconhecer/Portuguese/Verb. But
            # not if we also have "pronoun"
            if (
                "personal" in tags
                and "pronoun" not in tags
                and any(
                    x in tags
                    for x in [
                        "first-person",
                        "second-person",
                        "third-person",
                    ]
                )
            ):
                tags.remove("personal")

            # If we have impersonal, remove person and number.
            # This happens with e.g. viajar/Portuguese/Verb
            if "impersonal" in tags:
                tags = tags - set(
                    [
                        "first-person",
                        "second-person",
                        "third-person",
                        "singular",
                        "plural",
                    ]
                )

            # Remove unnecessary "positive" tag from verb forms
            if pos == "verb" and "positive" in tags:
                if "negative" in tags:
                    tags.remove("negative")
                tags.remove("positive")

            # Many Russian (and other Slavic) inflection tables
            # have animate/inanimate distinction that generates
            # separate entries for neuter/feminine, but the
            # distinction only applies to masculine. Remove them
            # form neuter/feminine and eliminate duplicates.
            if get_lang_conf(lang, "masc_only_animate"):
                for t1 in ("animate", "inanimate"):
                    for t2 in ("neuter", "feminine"):
                        if (
                            t1 in tags
                            and t2 in tags
                            and "masculine" not in tags
                            and "plural" not in tags
                        ):
                            tags.remove(t1)

            # German adjective tables contain "(keiner)" etc
            # for mixed declension plural. When the adjective
            # disappears and it becomes just one word, remove
            # the "includes-article" tag. e.g. eiskalt/German
            if "includes-article" in tags and " " not in form:
                tags.remove("includes-article")

            # Handle ignored forms. We mark that the form was
            # provided. This is important information; some words
            # just do not have a certain form. However, there also
            # many cases where no word in a language has a
            # particular form. Post-processing could detect and
            # remove such cases.
            if form in IGNORED_COLVALUES:
                # if cell text seems to be ignorable
                if "dummy-ignore-skipped" in tags:
                    continue
                if (
                    col_idx not in has_covering_hdr
                    and some_has_covered_text
                ):
                    continue
                # don't ignore this cell if there's been a header
                # above it
                form = "-"
            elif col_idx in has_covering_hdr:
                some_has_covered_text = True

            # Handle ambiguous object concord. If a header
            # gives the "dummy-object-concord"-tag to a word,
            # replace person, number and gender tags with
            # their "object-" counterparts so that the verb
            # agrees with the object instead.
            # Use only when the verb has ONLY object agreement!
            # a پخول/Pashto
            if "dummy-object-concord" in tags:
                for subtag, objtag in object_concord_replacements.items():
                    if subtag in tags:
                        tags.remove(subtag)
                        tags.add(objtag)

            # Remove the dummy mood tag that we sometimes
            # use to block adding other mood and related
            # tags
            tags = tags - set(
                [
                    "dummy-mood",
                    "dummy-tense",
                    "dummy-ignore-skipped",
                    "dummy-object-concord",
                    "dummy-reset-headers",
                    "dummy-use-as-coltags",
                    "dummy-use-as-rowtags",
                    "dummy-store-hdrspan",
                    "dummy-load-stored-hdrspans",
                    "dummy-reset-stored-hdrspans",
                    "dummy-section-header",
                ]
            )

            # Perform language-specific tag replacements according
            # to rules in a table.
            lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
            if lang_tag_mappings is not None:
                for pre, post in lang_tag_mappings.items():
                    if all(t in tags for t in pre):
                        tags = (tags - set(pre)) | set(post)

            # Warn if there are entries with empty tags
            if not tags:
                wxr.wtp.debug(
                    "inflection table: empty tags for {}".format(form),
                    sortid="inflection/1826",
                )

            # Warn if form looks like IPA
            ########## XXX ########
            # Because IPA is its own unicode block, we could also
            # technically do a Unicode name check to see if a string
            # contains IPA. Not all valid IPA characters are in the
            # IPA extension block, so you can technically have false
            # negatives if it's something like /toki/, but it
            # shouldn't give false positives.
            # Alternatively, you could make a list of IPA-admissible
            # characters and reject non-IPA stuff with that.
            if re.match(r"\s*/.*/\s*$", form):
                wxr.wtp.debug(
                    "inflection table form looks like IPA: "
                    "form={} tags={}".format(form, tags),
                    sortid="inflection/1840",
                )

            # Note that this checks `form`, not `in tags`
            if form == "dummy-ignored-text-cell":
                continue

            if "dummy-remove-this-cell" in tags:
                continue

            # Add the form
            tags = list(sorted(tags))
            dt = {"form": form, "tags": tags, "source": source}
            if roman:
                dt["roman"] = roman
            if ipa:
                dt["ipa"] = ipa
            ret.append(dt)
            # If we got separate clitic form, add it
            if clitic:
                dt = {
                    "form": clitic,
                    "tags": tags + ["clitic"],
                    "source": source,
                }
                ret.append(dt)
    return ret, form, some_has_covered_text
# First extract definitions from cells
# See defs_ht for footnote defs stuff
for row in rows:
    for cell in row:
        text, refs, defs, hdr_tags = extract_cell_content(
            lang, word, cell.text
        )
        # refs, defs = footnote stuff, defs -> (ref, def)
        add_defs(defs)
# Extract definitions from text after table
text, refs, defs, hdr_tags = extract_cell_content(lang, word, after)
add_defs(defs)

# Then extract the actual forms.
# Per-table state for the row loop below: collected form entries,
# active abstract header spans, and language-specific config tables.
ret = []
hdrspans = []
first_col_has_text = False
rownum = 0
title = None
global_tags = []
table_tags = []
special_phrase_splits = get_lang_conf(lang, "special_phrase_splits")
form_replacements = get_lang_conf(lang, "form_replacements")
form_transformations = get_lang_conf(lang, "form_transformations")
possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells")
cleanup_rules = get_lang_conf(lang, "minor_text_cleanups")

# Titles found around the table may contribute global/table-wide tags
# and even extra forms of their own.
for title in titles:
    more_global_tags, more_table_tags, extra_forms = parse_title(
        title, source
    )
    global_tags.extend(more_global_tags)
    table_tags.extend(more_table_tags)
    ret.extend(extra_forms)
# cell_rowcnt counts how many rows a (rowspan) cell has been seen in;
# seen_cells tracks cell identities across rows for rowspan handling.
cell_rowcnt = collections.defaultdict(int)
seen_cells = set()
has_covering_hdr = set()
some_has_covered_text = False
# Main per-row extraction loop: classify each cell as a title row,
# header cell (updates rowtags/hdrspans) or data cell (yields forms),
# and accumulate form entries in `ret`.  Relies on state set up above
# and on variables from earlier in the enclosing function (def_ht,
# depth, tablecontext, ...).
for row in rows:
    # print("ROW:", row)
    # print("====")
    # print(f"Start of PREVIOUS row hdrspans:"
    #       f"{tuple(sp.tagsets for sp in hdrspans)}")
    # print(f"Start of row txt: {tuple(t.text for t in row)}")
    if not row:
        continue  # Skip empty rows
    # NOTE(review): all_headers appears unused later in this loop —
    # confirm before removing.
    all_headers = all(x.is_title or not x.text.strip() for x in row)
    text = row[0].text
    if (
        row[0].is_title
        and text
        and not is_superscript(text[0])
        and text not in infl_map  # zealous inflation map?
        and (
            re.match(r"Inflection ", text)
            or re.sub(
                r"\s+",
                " ",  # flatten whitespace
                re.sub(
                    r"\s*\([^)]*\)",
                    "",
                    # Remove whitespace+parens
                    text,
                ),
            ).strip()
            not in infl_map
        )
        and not re.match(infl_start_re, text)
        and all(
            x.is_title == row[0].is_title and x.text == text
            # all InflCells in `row` have the same is_title and text
            for x in row
        )
    ):
        if text and title is None:
            # Only if there were no titles previously make the first
            # text that is found the title
            title = text
            if re.match(r"(Note:|Notes:)", title):
                continue  # not a title
            more_global_tags, more_table_tags, extra_forms = parse_title(
                title, source
            )
            global_tags.extend(more_global_tags)
            table_tags.extend(more_table_tags)
            ret.extend(extra_forms)
        continue  # Skip title rows without incrementing i
    if "dummy-skip-this" in global_tags:
        return []
    # Per-row state: rowtags accumulates tag sets from header cells in
    # this row; samecell_cnt skips the extra columns of colspan cells.
    rowtags = [()]
    # have_hdr = False
    # have_hdr never used?
    have_text = False
    samecell_cnt = 0
    col0_hdrspan = None  # col0 or later header (despite its name)
    col0_followed_by_nonempty = False
    row_empty = True
    for col_idx, cell in enumerate(row):
        colspan = cell.colspan  # >= 1
        rowspan = cell.rowspan  # >= 1
        previously_seen = id(cell) in seen_cells
        # checks to see if this cell was in the previous ROW
        seen_cells.add(id(cell))
        if samecell_cnt == 0:
            # First column of a (possible multi-column) cell
            samecell_cnt = colspan - 1
        else:
            assert samecell_cnt > 0
            samecell_cnt -= 1
            continue

        # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0
        # never used?

        # defaultdict(int) around line 1900
        cell_rowcnt[id(cell)] += 1
        # => how many cols this spans
        col = cell.text
        if not col:
            continue
        row_empty = False
        is_title = cell.is_title

        # If the cell has a target, i.e., text after colon, interpret
        # it as simply specifying a value for that value and ignore
        # it otherwise.
        if cell.target:
            text, refs, defs, hdr_tags = extract_cell_content(
                lang, word, col
            )
            if not text:
                continue
            refs_tags = set()
            for ref in refs:  # gets tags from footnotes
                if ref in def_ht:
                    refs_tags.update(def_ht[ref])
            rowtags = expand_header(
                wxr,
                tablecontext,
                word,
                lang,
                pos,
                text,
                [],
                silent=True,
                depth=depth,
                column_number=col_idx,
            )
            rowtags = list(
                set(tuple(sorted(set(x) | refs_tags)) for x in rowtags)
            )
            is_title = False
            col = cell.target

        # print(rownum, col_idx, col)
        # print(f"is_title: {is_title}")
        if is_title:
            # It is a header cell
            text, refs, defs, hdr_tags = extract_cell_content(
                lang, word, col
            )
            if not text:
                continue
            # Extract tags from referenced footnotes
            refs_tags = set()
            for ref in refs:
                if ref in def_ht:
                    refs_tags.update(def_ht[ref])

            # Expand header to tags
            v = expand_header(
                wxr,
                tablecontext,
                word,
                lang,
                pos,
                text,
                [],
                silent=True,
                depth=depth,
                column_number=col_idx,
            )
            # print("EXPANDED {!r} to {}".format(text, v))

            if col_idx == 0:
                # first_col_has_text is used for a test to ignore
                # upper-left cells that are just text without
                # header info
                first_col_has_text = True
            # Check if the header expands to reset hdrspans
            if any("dummy-reset-headers" in tt for tt in v):
                new_hdrspans = []
                for hdrspan in hdrspans:
                    # if there are HdrSpan objects (abstract headers with
                    # row- and column-spans) that are to the left or at the
                    # same row or below, KEEP those; things above and to
                    # the right of the hdrspan with dummy-reset-headers
                    # are discarded. Tags from the header together with
                    # dummy-reset-headers are kept as normal.
                    if (
                        hdrspan.start + hdrspan.colspan < col_idx
                        or hdrspan.rownum > rownum - cell.rowspan
                    ):
                        new_hdrspans.append(hdrspan)
                hdrspans = new_hdrspans

            for tt in v:
                if "dummy-section-header" in tt:
                    tablecontext.section_header = tt
                    break
                if "dummy-reset-section-header" in tt:
                    tablecontext.section_header = []
            # Text between headers on a row causes earlier headers to
            # be reset
            if have_text:
                # print("  HAVE_TEXT BEFORE HDR:", col)
                # Reset rowtags if new title column after previous
                # text cells
                # +-----+-----+-----+-----+
                # |hdr-a|txt-a|hdr-B|txt-B|
                # +-----+-----+-----+-----+
                #             ^reset rowtags=>
                # XXX beware of header "—": "" - must not clear on that if
                # it expands to no tags
                rowtags = [()]
            # have_hdr = True
            # have_hdr never used?
            # print("HAVE_HDR: {} rowtags={}".format(col, rowtags))
            # Update rowtags and coltags
            has_covering_hdr.add(col_idx)  # col_idx == current column
            # has_covering_hdr is a set that has the col_idx-ids of columns
            # that have previously had some kind of header. It is never
            # resetted inside the col_idx-loops OR the bigger rows-loop, so
            # applies to the whole table.

            rowtags, new_coltags, all_hdr_tags = generate_tags(
                rowtags, table_tags
            )

            if any("dummy-skip-this" in ts for ts in rowtags):
                continue  # Skip this cell

            if any("dummy-load-stored-hdrspans" in ts for ts in v):
                hdrspans.extend(tablecontext.stored_hdrspans)

            if any("dummy-reset-stored-hdrspans" in ts for ts in v):
                tablecontext.stored_hdrspans = []

            if any("dummy-store-hdrspan" in ts for ts in v):
                # print(f"STORED: {col}")
                store_new_hdrspan = True
            else:
                store_new_hdrspan = False

            new_coltags = list(
                x
                for x in new_coltags
                if not any(t in noinherit_tags for t in x)
            )
            # print("new_coltags={} previously_seen={} all_hdr_tags={}"
            #       .format(new_coltags, previously_seen, all_hdr_tags))
            if any(new_coltags):
                (
                    col,
                    col0_followed_by_nonempty,
                    col0_hdrspan,
                ) = add_new_hdrspan(
                    col,
                    hdrspans,
                    store_new_hdrspan,
                    col0_followed_by_nonempty,
                    col0_hdrspan,
                )

            continue

        # These values are ignored, at least for now
        if re.match(r"^(# |\(see )", col):
            continue

        if any("dummy-skip-this" in ts for ts in rowtags):
            continue  # Skip this cell

        # If the word has no rowtags and is a multi-row cell, then
        # ignore this. This happens with empty separator rows
        # within a rowspan>1 cell. cf. wander/English/Conjugation.
        if rowtags == [()] and rowspan > 1:
            continue

        # Minor cleanup. See e.g. είμαι/Greek/Verb present participle.
        if cleanup_rules:
            for regx, substitution in cleanup_rules.items():
                col = re.sub(regx, substitution, col)

        if (
            col_idx == 0
            and not first_col_has_text
            and get_lang_conf(lang, "ignore_top_left_text_cell") is True
        ):
            continue  # Skip text at top left, as in Icelandic, Faroese

        # if col0_hdrspan is not None:
        #     print("COL0 FOLLOWED NONHDR: {!r} by {!r}"
        #           .format(col0_hdrspan.text, col))
        col0_followed_by_nonempty = True
        have_text = True

        # Determine column tags for the multi-column cell
        combined_coltags = compute_coltags(
            lang, pos, hdrspans, col_idx, colspan, col
        )
        if any("dummy-ignored-text-cell" in ts for ts in combined_coltags):
            continue

        # Split the text into separate forms. First simplify spaces except
        # newline.
        col = re.sub(r"[ \t\r]+", " ", col)
        # Split the cell text into alternatives

        col, alts, split_extra_tags = split_text_into_alts(col)

        # Some cells have mixed form content, like text and romanization,
        # or text and IPA. Handle these.
        alts = handle_mixed_lines(alts)

        alts = list((x, combined_coltags) for x in alts)

        # Generate forms from the alternatives
        # alts is a list of (tuple of forms, tuple of tags)
        for (form, base_roman, ipa), coltags in alts:
            form = form.strip()
            extra_tags = []
            extra_tags.extend(split_extra_tags)
            # Handle special splits again here, so that we can have custom
            # mappings from form to form and tags.
            if form in form_replacements:
                replacement, tags = form_replacements[form]
                for x in tags.split():
                    assert x in valid_tags
                assert isinstance(replacement, str)
                assert isinstance(tags, str)
                form = replacement
                extra_tags.extend(tags.split())

            check_romanization_form_transformation = False
            # loop over regexes in form_transformation and replace text
            # in form using regex patterns
            # this does a bit of the same stuff the above does,
            # but with regexes and re.sub() instead
            for (
                form_transformations_pos,
                v,
                subst,
                tags,
            ) in form_transformations:
                # v is a pattern string, like "^ich"
                if pos != form_transformations_pos:
                    continue
                m = re.search(v, form)
                if m is not None:
                    form = re.sub(v, subst, form)
                    for x in tags.split():
                        assert x in valid_tags
                    extra_tags.extend(tags.split())
                    check_romanization_form_transformation = True
                    break

            # Clean the value, extracting reference symbols
            form, refs, defs, hdr_tags = extract_cell_content(
                lang, word, form
            )
            # if refs:
            #     print("REFS:", refs)
            extra_tags.extend(hdr_tags)
            # Extract tags from referenced footnotes
            refs_tags = set()
            for ref in refs:
                if ref in def_ht:
                    refs_tags.update(def_ht[ref])

            if base_roman:
                if check_romanization_form_transformation:
                    # because form_transformations are used to handle things
                    # where the romanization has the "same" structure, we
                    # need to handle that here too....
                    for (
                        _,
                        v,
                        subst,
                        _,
                    ) in form_transformations:
                        # v is a pattern string, like "^ich"
                        m = re.search(v, base_roman)
                        if m is not None:
                            base_roman = re.sub(v, subst, base_roman)
                            # XXX add tag stuff here if needed
                            break

                base_roman, _, _, hdr_tags = extract_cell_content(
                    lang, word, base_roman
                )
                extra_tags.extend(hdr_tags)

            # Do some additional cleanup on the cell.
            form = re.sub(r"^\s*,\s*", "", form)
            form = re.sub(r"\s*,\s*$", "", form)
            form = re.sub(r"\s*(,\s*)+", ", ", form)
            form = re.sub(r"(?i)^Main:", "", form)
            form = re.sub(r"\s+", " ", form)
            form = form.strip()

            # Look for parentheses that have semantic meaning
            form, et = find_semantic_parens(form)
            extra_tags.extend(et)

            # Handle parentheses in the table element. We parse
            # tags anywhere and romanizations anywhere but beginning.
            roman = base_roman
            paren = None
            clitic = None
            m = re.search(r"(\s+|^)\(([^)]*)\)", form)
            # start|spaces + (anything)
            if m is not None:
                subst = m.group(1)
                paren = m.group(2)
            else:
                m = re.search(r"\(([^)]*)\)(\s+|$)", form)
                # (anything) + spaces|end
                if m is not None:
                    paren = m.group(1)
                    subst = m.group(2)
            if paren is not None:
                # handle_parens also reads paren/m/subst set up above
                form, roman, clitic = handle_parens(
                    form, roman, clitic, extra_tags
                )

            # Ignore certain forms that are not really forms,
            # unless they're really, really close to the article title
            if form in (
                "",
                "unchanged",
                "after an",  # in sona/Irish/Adj/Mutation
            ):
                Lev = distw([form], word)
                if form and Lev < 0.1:
                    wxr.wtp.debug(
                        "accepted possible false positive '{}' with"
                        "> 0.1 Levenshtein distance in {}/{}".format(
                            form, word, lang
                        ),
                        sortid="inflection/2213",
                    )
                elif form and Lev < 0.3:
                    wxr.wtp.debug(
                        "skipped possible match '{}' with > 0.3"
                        "Levenshtein distance in {}/{}".format(
                            form, word, lang
                        ),
                        sortid="inflection/2218",
                    )
                    continue
                else:
                    continue
            # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} "
            #       "FORM={!r} ROMAN={!r}"
            #       .format(rowtags, coltags, refs_tags,
            #               form, roman))

            # Merge tags from row and column and do miscellaneous
            # tag-related handling.
            (
                merge_ret,
                form,
                some_has_covered_text,
            ) = merge_row_and_column_tags(form, some_has_covered_text)
            ret.extend(merge_ret)

    # End of row.
    rownum += 1
    # For certain languages, if the row was empty, reset
    # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb).
    if row_empty and get_lang_conf(lang, "empty_row_resets"):
        hdrspans = []
    # Check if we should expand col0_hdrspan.
    if col0_hdrspan is not None:
        col0_allowed = get_lang_conf(lang, "hdr_expand_first")
        col0_cats = tagset_cats(col0_hdrspan.tagsets)
        # Only expand if col0_cats and later_cats are allowed
        # and don't overlap and col0 has tags, and there have
        # been no disallowed cells in between.
        if (
            not col0_followed_by_nonempty
            and not (col0_cats - col0_allowed)
            and
            # len(col0_cats) == 1 and
            col_idx > col0_hdrspan.start + col0_hdrspan.colspan
        ):
            # If an earlier header is only followed by headers that yield
            # no tags, expand it to entire row
            # print("EXPANDING COL0: {} from {} to {} cols {}"
            #       .format(col0_hdrspan.text, col0_hdrspan.colspan,
            #               len(row) - col0_hdrspan.start,
            #               col0_hdrspan.tagsets))
            col0_hdrspan.colspan = len(row) - col0_hdrspan.start
            col0_hdrspan.expanded = True
# XXX handle refs and defs
# for x in hdrspans:
#     print("  HDRSPAN {} {} {} {!r}"
#           .format(x.start, x.colspan, x.tagsets, x.text))
2695 # Post-process German nouns with articles in separate columns. We move the
2696 # definite/indefinite/usually-without-article markers into the noun and
2697 # remove the article entries.
2698 if get_lang_conf(lang, "articles_in_separate_columns") and any(
2699 "noun" in x["tags"] for x in ret
2700 ):
2701 new_ret = []
2702 saved_tags = set()
2703 had_noun = False
2704 for dt in ret:
2705 tags = dt["tags"]
2706 # print(tags)
2707 if "noun" in tags:
2708 tags = list(
2709 sorted(set(t for t in tags if t != "noun") | saved_tags)
2710 )
2711 had_noun = True
2712 elif ( 2712 ↛ 2739line 2712 didn't jump to line 2739 because the condition on line 2712 was always true
2713 "indefinite" in tags
2714 or "definite" in tags
2715 or "usually-without-article" in tags
2716 or "without-article" in tags
2717 ):
2718 if had_noun:
2719 saved_tags = set(tags)
2720 else:
2721 saved_tags = saved_tags | set(tags) # E.g. Haus/German
2722 remove_useless_tags(lang, pos, saved_tags)
2723 saved_tags = saved_tags & set(
2724 [
2725 "masculine",
2726 "feminine",
2727 "neuter",
2728 "singular",
2729 "plural",
2730 "indefinite",
2731 "definite",
2732 "usually-without-article",
2733 "without-article",
2734 ]
2735 )
2736 had_noun = False
2737 continue # Skip the articles
2739 dt = dt.copy()
2740 dt["tags"] = tags
2741 new_ret.append(dt)
2742 ret = new_ret
2744 elif possibly_ignored_forms:
2745 # Some languages have tables with cells that are kind of separated
2746 # and difficult to handle, like eulersche Formel/German where
2747 # the definite and indefinite articles are just floating.
2748 # If a language has a dict of conditionally_ignored_cells,
2749 # and if the contents of a cell is found in one of the rules
2750 # there, ignore that cell if it
2751 # 1. Does not have the appropriate tag (like "definite" for "die")
2752 # and
2753 # 2. The title of the article is not one of the other co-words
2754 # (ie. it's an article for the definite articles in german etc.)
2755 # pass
2756 new_ret = []
2757 for cell_data in ret:
2758 tags = cell_data["tags"]
2759 text = cell_data["form"]
2760 skip_this = False
2761 for key_tag, ignored_forms in possibly_ignored_forms.items():
2762 if text not in ignored_forms: 2762 ↛ 2764line 2762 didn't jump to line 2764 because the condition on line 2762 was always true
2763 continue
2764 if word in ignored_forms:
2765 continue
2766 if key_tag not in tags:
2767 skip_this = True
2769 if skip_this: 2769 ↛ 2770line 2769 didn't jump to line 2770 because the condition on line 2769 was never true
2770 continue
2771 new_ret.append(cell_data)
2773 ret = new_ret
2775 # Post-process English inflection tables, addding "multiword-construction"
2776 # when the number of words has increased.
2777 if lang == "English" and pos == "verb":
2778 word_words = len(word.split())
2779 new_ret = []
2780 for dt in ret:
2781 form = dt.get("form", "")
2782 if len(form.split()) > word_words:
2783 dt = dt.copy()
2784 dt["tags"] = list(dt.get("tags", []))
2785 # This strange copy-assigning shuffle is preventative black
2786 # magic; do not touch lest you invoke deep bugs.
2787 data_append(dt, "tags", "multiword-construction")
2788 new_ret.append(dt)
2789 ret = new_ret
2791 # Always insert "table-tags" detail as the first entry in any inflection
2792 # table. This way we can reliably detect where a new table starts.
2793 # Table-tags applies until the next table-tags entry.
2794 if ret or table_tags:
2795 table_tags = list(sorted(set(table_tags)))
2796 dt = {
2797 "form": " ".join(table_tags),
2798 "source": source,
2799 "tags": ["table-tags"],
2800 }
2801 if dt["form"] == "":
2802 dt["form"] = "no-table-tags"
2803 if tablecontext.template_name:
2804 tn = {
2805 "form": tablecontext.template_name,
2806 "source": source,
2807 "tags": ["inflection-template"],
2808 }
2809 ret = [dt] + [tn] + ret
2810 else:
2811 ret = [dt] + ret
2813 return ret
def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Turn a grid of InflCell rows into form entries on ``data``.

    Delegates the actual interpretation to parse_simple_table() and then
    appends the resulting form dicts to ``data["forms"]``, dropping exact
    duplicates.  A form that differs from an already-seen form only by a
    "dated" tag is also dropped (some Russian entries have a modern and a
    pre-reform declension table that overlap this way)."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        assert isinstance(row, list)
        for x in row:
            assert isinstance(x, InflCell)
    assert isinstance(titles, list)
    for x in titles:
        assert isinstance(x, str)

    # The simple-table parser covers every layout we currently support.
    parsed = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if parsed is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Append the parsed forms, skipping duplicates.
    seen_forms = set()
    for entry in parsed:
        frozen = freeze(entry)
        if frozen in seen_forms:
            continue  # exact duplicate
        entry_tags = entry.get("tags", [])
        # Check whether this entry is just a "dated" variant of a form we
        # already collected from the modern declension table.
        redundant_dated = False
        for dated_tag in ("dated",):
            if dated_tag not in entry_tags:
                continue
            stripped = entry.copy()
            remaining_tags = [t for t in entry_tags if t != dated_tag]
            stripped["tags"] = remaining_tags
            if remaining_tags and freeze(stripped) in seen_forms:
                redundant_dated = True  # already have it without "dated"
                break
        if redundant_dated:
            continue
        if "table-tags" not in entry_tags:
            seen_forms.add(frozen)
        data_append(data, "forms", entry)
def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Decide whether a table cell should be treated as a header.

    Combines several heuristics: explicit header markup (<th> /
    TABLE_HEADER_CELL), "label: value" prefixes found in infl_map,
    whether expand_header() accepts the cleaned text without error tags
    (candidate_hdr), per-language whitelists in
    LANGUAGES_WITH_CELLS_AS_HEADERS, whole-column header flags in
    ``cols_headered``, and a comparison of the cell's style string with
    the first column's style.

    Returns a tuple (is_title, hdr_expansion, target, celltext):
    is_title is True when the cell is a header; hdr_expansion is the
    tagset expansion of the cleaned cell text; target is the "value"
    part split off from a "label: value" header (or the ``target``
    argument unchanged); celltext may be truncated to the label part.
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # Determine which cell kind marks an explicit header for this table type.
    # NOTE(review): header_kind is only assigned for TABLE and HTML table
    # kinds; any other NodeKind would leave it unbound (the assertion above
    # plus current callers appear to make that unreachable — confirm).
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        # NOTE(review): .get(lang, "") defaults to a string, so for a
        # missing language this degenerates to a substring test on "";
        # non-empty `cleaned` is then always rejected — confirm intended.
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    # "label: value" header whose label is a known inflection term: split
    # the value off into ``target`` and keep only the label as cell text.
    if idx >= 0 and titletext[:idx] in infl_map:
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext
class TableContext:
    """Saved context used when parsing a table and its subtables.

    Carries header spans and section headers across nested-table
    recursion, plus the name of the template the table came from.
    """

    # Fix: was misspelled "__slot__", which is an ordinary (ignored) class
    # attribute; with the correct name, instances are restricted to exactly
    # these attributes (the only ones this module ever assigns).
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name: Optional[str] = None) -> None:
        # Header spans carried over between a table and its subtables.
        self.stored_hdrspans = []
        # Header texts collected from the enclosing section.
        self.section_header = []
        # Name of the generating template; falsy input normalizes to "".
        if not template_name:
            self.template_name = ""
        else:
            self.template_name = template_name
def handle_wikitext_or_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms. This adds the forms into ``data``.

    ``tree`` must be a wikitext TABLE node or an HTML <table> node.
    ``titles`` holds title strings gathered from surrounding markup,
    ``source`` names the originating section, and ``after`` is cleaned
    text that followed the table.  A fresh TableContext is created when
    ``tablecontext`` is not supplied."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("-------==========-------")

    if not tablecontext:
        tablecontext = TableContext()

    def handle_table1(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        depth,
    ):
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order.

        Returns a list of (rows, titles, after, depth) tuples, one per
        table piece (nested tables split the enclosing table into
        pieces so that they are processed in document order)."""

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        col_gap_data = []  # Filling for columns with rowspan > 1
        # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
        # the column; vertical_still_left contains int
        cols_headered = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows = []

        sub_ret = []  # Pieces from nested tables, in document order

        # from wikitextprocessor.parser import print_tree
        # print_tree(tree)
        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            # Normalize: for HTML nodes use the tag name as the "kind".
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are intially shown in tables
                    # that have more data. The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # if (
                #     len(node.children) == 1
                #     and node.children[0].attrs.get("class") == "separator"
                # ):
                #     print("------------------ skip separator")
                #     continue

                # Parse a table row.
                row = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in get_table_cells(node):
                    # loop through each cell in the ROW

                    # The below skip is not needed anymore, because we "skip" in
                    # get_table_cells, but left here as a comment
                    # if not isinstance(col, WikiNode):
                    #     # This skip is not used for counting,
                    #     # "None" is not used in
                    #     # indexing or counting or looping.
                    #     continue
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print(" UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    # Fill in gaps left by rowspans from earlier rows before
                    # placing this cell.
                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column. It is populated at the end of the loop, at the
                        # same time as col_gap_data. This needs to be looped and
                        # filled this way because each `for col`-looping jumps
                        # straight to the next meaningful cell; there is no
                        # "None" cells, only emptiness between, and rowspan and
                        # colspan are just to generate the "fill-
                        vertical_still_left[len(row)] -= 1
                        row.append(col_gap_data[len(row)])

                        # appending row is how "indexing" is
                        # done here; something is appended,
                        # like a filler-cell here or a "start"
                        # cell at the end of the row-loop,
                        # which increased len(row) which is
                        # then used as the target-index to check
                        # for gaps. vertical_still_left is
                        # the countdown to when to stop
                        # filling in gaps, and goes down to 0,
                        # and col_gap_data is not touched
                        # except when a new rowspan is needed,
                        # at the same time that
                        # vertical_still_left gets reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Too many of these errors; clamp absurd spans silently.
                    if colspan > 100:
                        # wxr.wtp.error(
                        #     f"Colspan {colspan} over 30, set to 1",
                        #     sortid="inflection/20250113a",
                        # )
                        colspan = 100
                    if rowspan > 100:
                        # wxr.wtp.error(
                        #     f"Rowspan {rowspan} over 30, set to 1",
                        #     sortid="inflection/20250113b",
                        # )
                        rowspan = 100

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: isinstance(x, WikiNode)
                        and (x.kind == NodeKind.TABLE or x.sarg == "table"),
                    )

                    # Clean the rest of the cell.
                    celltext = clean_node(wxr, None, rest)
                    # print("CLEANED:", celltext)
                    # print(f"SUBTABLES: {tables}")

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a descrip-
                        # tive text that should be treated as a title (e.g.,
                        # "Forms with the definite article", with "definite" not
                        # mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Split the current table here so the nested
                            # table's pieces stay in document order.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    # Strip trailing superscripts (reference marks) from the
                    # title text before header detection.
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #     row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/ed if
                                # it is lacking slots
                                # for each column; col_gap_data and
                                # vertical_still_left are never
                                # reset to [], during
                                # the whole table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that
                            # will be used in the
                            # future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to be
                            # filled (row.append or
                            # row[col_gap_data[len(row)] =>
                            # rows), it is not reset to [], but decremented to 0
                            # each time a row gets something from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects. Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )
def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """Yield the cell nodes of a table row ``node``.

    Wikitext table cells sometimes contain raw HTML `<td>`/`<th>`
    elements (authors write wikitext conditionals that way), and those
    end up parsed as children of the wikitext cell.  This yields each
    direct WikiNode child of ``node``; when such a child contains direct
    td/th HTML children, those are detached from it and yielded right
    after it, so no cell is produced twice."""

    def _is_html_cell(child) -> bool:
        # True for a direct td/th HTML element nested inside a cell.
        return isinstance(child, HTMLNode) and child.sarg in ("th", "td")

    for child in node.children:
        if not isinstance(child, WikiNode):
            continue
        nested_cells = [c for c in child.children if _is_html_cell(c)]
        if nested_cells:
            # Detach the td/th elements so they are not returned twice.
            child.children = [
                c for c in child.children if not _is_html_cell(c)
            ]
            yield child
            yield from nested_cells
        else:
            yield child
def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Delegate an HTML <table> node to the shared wikitext/HTML handler.

    XXX thin pass-through kept for symmetry; consider removing."""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext=tablecontext,
    )
def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Delegate a wikitext TABLE node to the shared wikitext/HTML handler.

    XXX thin pass-through kept for symmetry; consider removing."""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext=tablecontext,
    )
def parse_inflection_section(
    wxr, data, word, lang, pos, section, tree, tablecontext=None
):
    """Parses an inflection section on a page. ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    Walks ``tree`` collecting tables (each recorded as a mutable list
    [kind, node, titles, after-text-parts], kind being "wikitext" or
    "html") plus title text from NavFrame wrappers and preceding bolded
    ";" list items, then hands each table to the appropriate handler."""

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    #       .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    tables = []  # Collected [kind, node, titles, after-parts] records
    titleparts = []  # Text fragments accumulated inside a NavFrame header
    preceding_bolded_title = ""

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("--------------******************----------------")

    def process_tables():
        # Dispatch each collected table to the matching handler; the
        # trailing text parts are joined and cleaned into ``after``.
        for kind, node, titles, after in tables:
            after = "".join(after).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node, titles):
        # Handle a NavFrame div: collect its NavHead text as a title and
        # process its tables in isolation (saving/restoring ``tables``).
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(node, titles, navframe=False):
        # Recursive tree walk; ``navframe`` is True while inside a
        # NavFrame wrapper (title text is collected instead of tables).
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Bare strings after a table accumulate as its "after" text;
            # inside a NavFrame they build up the frame's title.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    # "Note:" headers are not table titles.
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                classes = node.attrs.get("class", ())
                if "audiotable" in classes:
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
            if (
                kind == NodeKind.HTML
                and node.sarg == "div"
                and "NavFrame" in node.attrs.get("class", "").split()
            ):
                recurse_navframe(node, titles)
                return
        if kind == NodeKind.LINK:
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.HTML and node.sarg == "ref":
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A bolded ";" list item becomes the title for following tables.
            nonlocal preceding_bolded_title
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")