Coverage for src/wiktextract/extractor/en/inflection.py: 86%

1# Code for parsing inflection tables.

5import collections

6import copy

7import functools

8import html

9import itertools

10import re

11import unicodedata

12from typing import Optional, Union

14from wikitextprocessor import MAGIC_FIRST, NodeKind, WikiNode

16from ...clean import clean_value

17from ...datautils import data_append, freeze, split_at_comma_semi

18from ...tags import valid_tags

19from ...wxr_context import WiktextractContext

20from .form_descriptions import (

21 classify_desc,

22 decode_tags,

23 distw,

24 parse_head_final_tags,

25)

26from .inflectiondata import infl_map, infl_start_map, infl_start_re

27from .lang_specific_configs import get_lang_conf, lang_specific_tags

28from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS

29from .type_utils import FormData

31# --debug-text-cell WORD

32# Command-line parameter for debugging. When parsing inflection tables,

33# print out debug messages when encountering this text.

34debug_cell_text: Optional[str] = None

37def set_debug_cell_text(text: str) -> None:

38 global debug_cell_text

39 debug_cell_text = text

42TagSets = list[tuple[str, ...]]

44# Column texts that are interpreted as an empty column.

45IGNORED_COLVALUES = {

46 "-",

47 "־",

48 "᠆",

49 "‐",

50 "‑",

51 "‒",

52 "–",

53 "—",

54 "―",

55 "−",

56 "⸺",

57 "⸻",

58 "﹘",

59 "﹣",

60 "－",

61 "/",

62 "?",

63 "not used",

64 "not applicable",

65}

67# These tags are never inherited from above

68# XXX merge with lang_specific

69noinherit_tags = {

70 "infinitive-i",

71 "infinitive-i-long",

72 "infinitive-ii",

73 "infinitive-iii",

74 "infinitive-iv",

75 "infinitive-v",

76}

78# Subject->object transformation mapping, when using dummy-object-concord

79# to replace subject concord tags with object concord tags

80object_concord_replacements = {

81 "first-person": "object-first-person",

82 "second-person": "object-second-person",

83 "third-person": "object-third-person",

84 "singular": "object-singular",

85 "plural": "object-plural",

86 "definite": "object-definite",

87 "indefinite": "object-indefinite",

88 "class-1": "object-class-1",

89 "class-2": "object-class-2",

90 "class-3": "object-class-3",

91 "class-4": "object-class-4",

92 "class-5": "object-class-5",

93 "class-6": "object-class-6",

94 "class-7": "object-class-7",

95 "class-8": "object-class-8",

96 "class-9": "object-class-9",

97 "class-10": "object-class-10",

98 "class-11": "object-class-11",

99 "class-12": "object-class-12",

100 "class-13": "object-class-13",

101 "class-14": "object-class-14",

102 "class-15": "object-class-15",

103 "class-16": "object-class-16",

104 "class-17": "object-class-17",

105 "class-18": "object-class-18",

106 "masculine": "object-masculine",

107 "feminine": "object-feminine",

108}

109

110# Words in title that cause addition of tags in all entries

111title_contains_global_map = {

112 "possessive": "possessive",

113 "possessed forms of": "possessive",

114 "predicative forms of": "predicative",

115 "negative": "negative",

116 "positive definite forms": "positive definite",

117 "positive indefinite forms": "positive indefinite",

118 "comparative": "comparative",

119 "superlative": "superlative",

120 "combined forms": "combined-form",

121 "mutation": "mutation",

122 "definite article": "definite",

123 "indefinite article": "indefinite",

124 "indefinite declension": "indefinite",

125 "bare forms": "indefinite", # e.g., cois/Irish

126 "definite declension": "definite",

127 "pre-reform": "dated",

128 "personal pronouns": "personal pronoun",

129 "composed forms of": "multiword-construction",

130 "subordinate-clause forms of": "subordinate-clause",

131 "participles of": "participle",

132 "variation of": "dummy-skip-this", # a'/Scottish Gaelic

133 "command form of": "imperative", # a راتلل/Pashto

134 "historical inflection of": "dummy-skip-this", # kork/Norwegian Nynorsk

135 "obsolete declension": "obsolete", # März/German 20241111

136}

137for k, v in title_contains_global_map.items():

138 if any(t not in valid_tags for t in v.split()): 138 ↛ 139line 138 didn't jump to line 139 because the condition on line 138 was never true

139 print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

140table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

141

142table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")

143# (?i) python regex extension, ignore case

144title_contains_global_re = re.compile(

145 r"(?i)(^|\b)({}|{})($|\b)".format(

146 table_hdr_ign_part,

147 "|".join(re.escape(x) for x in title_contains_global_map.keys()),

148 )

149)

150

151# Words in title that cause addition of tags to table-tags "form"

152title_contains_wordtags_map = {

153 "pf": "perfective",

154 "impf": "imperfective",

155 "strong": "strong",

156 "weak": "weak",

157 "countable": "countable",

158 "uncountable": "uncountable",

159 "inanimate": "inanimate",

160 "animate": "animate",

161 "transitive": "transitive",

162 "intransitive": "intransitive",

163 "ditransitive": "ditransitive",

164 "ambitransitive": "ambitransitive",

165 "archaic": "archaic",

166 "dated": "dated",

167 "affirmative": "affirmative",

168 "negative": "negative",

169 "subject pronouns": "subjective",

170 "object pronouns": "objective",

171 "emphatic": "emphatic",

172 "proper noun": "proper-noun",

173 "no plural": "no-plural",

174 "imperfective": "imperfective",

175 "perfective": "perfective",

176 "no supine stem": "no-supine",

177 "no perfect stem": "no-perfect",

178 "deponent": "deponent",

179 "irregular": "irregular",

180 "no short forms": "no-short-form",

181 "iō-variant": "iō-variant",

182 "1st declension": "declension-1",

183 "2nd declension": "declension-2",

184 "3rd declension": "declension-3",

185 "4th declension": "declension-4",

186 "5th declension": "declension-5",

187 "6th declension": "declension-6",

188 "first declension": "declension-1",

189 "second declension": "declension-2",

190 "third declension": "declension-3",

191 "fourth declension": "declension-4",

192 "fifth declension": "declension-5",

193 "sixth declension": "declension-6",

194 "1st conjugation": "conjugation-1",

195 "2nd conjugation": "conjugation-2",

196 "3rd conjugation": "conjugation-3",

197 "4th conjugation": "conjugation-4",

198 "5th conjugation": "conjugation-5",

199 "6th conjugation": "conjugation-6",

200 "7th conjugation": "conjugation-7",

201 "first conjugation": "conjugation-1",

202 "second conjugation": "conjugation-2",

203 "third conjugation": "conjugation-3",

204 "fourth conjugation": "conjugation-4",

205 "fifth conjugation": "conjugation-5",

206 "sixth conjugation": "conjugation-6",

207 "seventh conjugation": "conjugation-7",

208 # Corsican regional tags in table header

209 "cismontane": "Cismontane",

210 "ultramontane": "Ultramontane",

211 "western lombard": "Western-Lombard",

212 "eastern lombard": "Eastern-Lombard",

213}

214for k, v in title_contains_wordtags_map.items():

215 if any(t not in valid_tags for t in v.split()): 215 ↛ 216line 215 didn't jump to line 216 because the condition on line 215 was never true

216 print(

217 "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)

218 )

219title_contains_wordtags_re = re.compile(

220 r"(?i)(^|\b)({}|{})($|\b)".format(

221 table_hdr_ign_part,

222 "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),

223 )

224)

225

226# Parenthesized elements in title that are converted to tags in

227# "table-tags" form

228title_elements_map = {

229 "weak": "weak",

230 "strong": "strong",

231 "separable": "separable",

232 "masculine": "masculine",

233 "feminine": "feminine",

234 "neuter": "neuter",

235 "singular": "singular",

236 "plural": "plural",

237 "archaic": "archaic",

238 "dated": "dated",

239 "Attic": "Attic", # e.g. καλός/Greek/Adj

240 "Epic": "Epic", # e.g. καλός/Greek/Adj

241}

242for k, v in title_elements_map.items():

243 if any(t not in valid_tags for t in v.split()): 243 ↛ 244line 243 didn't jump to line 244 because the condition on line 243 was never true

244 print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

245

246# Parenthized element starts to map them to tags for form for the rest of

247# the element

248title_elemstart_map = {

249 "auxiliary": "auxiliary",

250 "Kotus type": "class",

251 "ÕS type": "class",

252 "class": "class",

253 "short class": "class",

254 "type": "class",

255 "strong class": "class",

256 "weak class": "class",

257 "accent paradigm": "accent-paradigm",

258 "stem in": "class",

259}

260for k, v in title_elemstart_map.items():

261 if any(t not in valid_tags for t in v.split()): 261 ↛ 262line 261 didn't jump to line 262 because the condition on line 261 was never true

262 print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

263title_elemstart_re = re.compile(

264 r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))

265)

266

267

268# Regexp for cell starts that are likely definitions of reference symbols.

269# See also nondef_re.

270def_re = re.compile(

271 r"(\s*•?\s+)?"

272 r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"

273 r"\^(\*+|[△†])|"

274 r"([¹²³⁴⁵⁶⁷⁸⁹])|"

275 r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"

276)

277# ᴺᴸᴴ persan/Old Irish

278

279# Regexp for cell starts that are exceptions to def_re and do not actually

280# start a definition.

281nondef_re = re.compile(

282 r"(^\s*(1|2|3)\s+(sg|pl)\s*$|" # 1s or 3p etc.

283 r"\s*\d\d?\s*/\s*\d\d?\s*$)"

284) # taka/Swahili "15 / 17"

285

286# Certain tags are moved from headers in tables into word tags, as they always

287# apply to the whole word.

288TAGS_FORCED_WORDTAGS: set[str] = set(

289 [

290 # This was originally created for a issue with number paradigms in

291 # Arabic, but that is being handled elsewhere now.

292 ]

293)

294

295

296class InflCell:

297 """Cell in an inflection table."""

298

299 __slots__ = (

300 "text",

301 "is_title",

302 "colspan",

303 "rowspan",

304 "target",

305 )

306

307 def __init__(

308 self,

309 text: str,

310 is_title: bool,

311 colspan: int,

312 rowspan: int,

313 target: Optional[str],

314 ) -> None:

315 assert isinstance(text, str)

316 assert is_title in (True, False)

317 assert isinstance(colspan, int) and colspan >= 1

318 assert isinstance(rowspan, int) and rowspan >= 1

319 assert target is None or isinstance(target, str)

320 self.text = text.strip()

321 self.is_title = text and is_title

322 self.colspan = colspan

323 self.rowspan = rowspan

324 self.target = target

325

326 def __str__(self) -> str:

327 v = "{}/{}/{}/{!r}".format(

328 self.text, self.is_title, self.colspan, self.rowspan

329 )

330 if self.target:

331 v += ": {!r}".format(self.target)

332 return v

333

334 def __repr__(self) -> str:

335 return str(self)

336

337

338class HdrSpan:

339 """Saved information about a header cell/span during the parsing

340 of a table."""

341

342 __slots__ = (

343 "start",

344 "colspan",

345 "rowspan",

346 "rownum", # Row number where this occurred

347 "tagsets", # list of tuples

348 "text", # For debugging

349 "all_headers_row",

350 "expanded", # The header has been expanded to cover whole row/part

351 )

352

353 def __init__(

354 self,

355 start: int,

356 colspan: int,

357 rowspan: int,

358 rownum: int,

359 tagsets: TagSets,

360 text: str,

361 all_headers_row: bool,

362 ) -> None:

363 assert isinstance(start, int) and start >= 0

364 assert isinstance(colspan, int) and colspan >= 1

365 assert isinstance(rownum, int)

366 assert isinstance(tagsets, list)

367 for x in tagsets:

368 assert isinstance(x, tuple)

369 assert all_headers_row in (True, False)

370 self.start = start

371 self.colspan = colspan

372 self.rowspan = rowspan

373 self.rownum = rownum

374 self.tagsets = list(tuple(sorted(set(tags))) for tags in tagsets)

375 self.text = text

376 self.all_headers_row = all_headers_row

377 self.expanded = False

378

379

380def is_superscript(ch: str) -> bool:

381 """Returns True if the argument is a superscript character."""

382 assert isinstance(ch, str) and len(ch) == 1

383 try:

384 name = unicodedata.name(ch)

385 except ValueError:

386 return False

387 return (

388 re.match(

389 r"SUPERSCRIPT |"

390 r"MODIFIER LETTER SMALL |"

391 r"MODIFIER LETTER CAPITAL ",

392 name,

393 )

394 is not None

395 )

396

397

398def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:

399 """Remove certain tag combinations from ``tags`` when they serve no purpose

400 together (cover all options)."""

401 assert isinstance(lang, str)

402 assert isinstance(pos, str)

403 assert isinstance(tags, set)

404 if (

405 "animate" in tags

406 and "inanimate" in tags

407 and get_lang_conf(lang, "animate_inanimate_remove")

408 ):

409 tags.remove("animate")

410 tags.remove("inanimate")

411 if (

412 "virile" in tags

413 and "nonvirile" in tags

414 and get_lang_conf(lang, "virile_nonvirile_remove")

415 ):

416 tags.remove("virile")

417 tags.remove("nonvirile")

418 # If all numbers in the language are listed, remove them all

419 numbers = get_lang_conf(lang, "numbers")

420 if numbers and all(x in tags for x in numbers):

421 for x in numbers:

422 tags.remove(x)

423 # If all genders in the language are listed, remove them all

424 genders = get_lang_conf(lang, "genders")

425 if genders and all(x in tags for x in genders):

426 for x in genders:

427 tags.remove(x)

428 # If all voices in the language are listed, remove them all

429 voices = get_lang_conf(lang, "voices")

430 if voices and all(x in tags for x in voices):

431 for x in voices:

432 tags.remove(x)

433 # If all strengths of the language are listed, remove them all

434 strengths = get_lang_conf(lang, "strengths")

435 if strengths and all(x in tags for x in strengths):

436 for x in strengths:

437 tags.remove(x)

438 # If all persons of the language are listed, remove them all

439 persons = get_lang_conf(lang, "persons")

440 if persons and all(x in tags for x in persons):

441 for x in persons:

442 tags.remove(x)

443 # If all definitenesses of the language are listed, remove them all

444 definitenesses = get_lang_conf(lang, "definitenesses")

445 if definitenesses and all(x in tags for x in definitenesses):

446 for x in definitenesses:

447 tags.remove(x)

448

449

450def tagset_cats(tagset: TagSets) -> set[str]:

451 """Returns a set of tag categories for the tagset (merged from all

452 alternatives)."""

453 return set(valid_tags[t] for ts in tagset for t in ts)

454

455

456def or_tagsets(

457 lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets

458) -> TagSets:

459 """Merges two tagsets (the new tagset just merges the tags from both, in

460 all combinations). If they contain simple alternatives (differ in

461 only one category), they are simply merged; otherwise they are split to

462 more alternatives. The tagsets are assumed be sets of sorted tuples."""

463 assert isinstance(tagsets1, list)

464 assert all(isinstance(x, tuple) for x in tagsets1)

465 assert isinstance(tagsets2, list)

466 assert all(isinstance(x, tuple) for x in tagsets1)

467 tagsets: TagSets = [] # This will be the result

468

469 def add_tags(tags1: tuple[str, ...]) -> None:

470 # CONTINUE

471 if not tags1:

472 return # empty set would merge with anything, won't change result

473 if not tagsets:

474 tagsets.append(tags1)

475 return

476 for tags2 in tagsets:

477 # Determine if tags1 can be merged with tags2

478 num_differ = 0

479 if tags1 and tags2: 479 ↛ 497line 479 didn't jump to line 497 because the condition on line 479 was always true

480 cats1 = set(valid_tags[t] for t in tags1)

481 cats2 = set(valid_tags[t] for t in tags2)

482 cats = cats1 | cats2

483 for cat in cats:

484 tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)

485 tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)

486 if (

487 tags1_in_cat != tags2_in_cat

488 or not tags1_in_cat

489 or not tags2_in_cat

490 ):

491 num_differ += 1

492 if not tags1_in_cat or not tags2_in_cat:

493 # Prevent merging if one is empty

494 num_differ += 1

495 # print("tags1={} tags2={} num_differ={}"

496 # .format(tags1, tags2, num_differ))

497 if num_differ <= 1:

498 # Yes, they can be merged

499 tagsets.remove(tags2)

500 tags_s = set(tags1) | set(tags2)

501 remove_useless_tags(lang, pos, tags_s)

502 tags_t = tuple(sorted(tags_s))

503 add_tags(tags_t) # Could result in further merging

504 return

505 # If we could not merge, add to tagsets

506 tagsets.append(tags1)

507

508 for tags in tagsets1:

509 add_tags(tags)

510 for tags in tagsets2:

511 add_tags(tags)

512 if not tagsets:

513 tagsets.append(())

514

515 # print("or_tagsets: {} + {} -> {}"

516 # .format(tagsets1, tagsets2, tagsets))

517 return tagsets

518

519

520def and_tagsets(

521 lang: str,

522 pos: str,

523 tagsets1: list[tuple[str, ...]],

524 tagsets2: list[tuple[str, ...]],

525) -> list[tuple[str, ...]]:

526 """Merges tagsets by taking union of all cobinations, without trying

527 to determine whether they are compatible."""

528 assert isinstance(tagsets1, list) and len(tagsets1) >= 1

529 assert all(isinstance(x, tuple) for x in tagsets1)

530 assert isinstance(tagsets2, list) and len(tagsets2) >= 1

531 assert all(isinstance(x, tuple) for x in tagsets1)

532 new_tagsets = []

533 tags: Union[set[str], tuple[str, ...]]

534 for tags1 in tagsets1:

535 for tags2 in tagsets2:

536 tags = set(tags1) | set(tags2)

537 remove_useless_tags(lang, pos, tags)

538 if "dummy-ignored-text-cell" in tags: 538 ↛ 539line 538 didn't jump to line 539 because the condition on line 538 was never true

539 tags.remove("dummy-ignored-text-cell")

540 tags = tuple(sorted(tags))

541 if tags not in new_tagsets: 541 ↛ 535line 541 didn't jump to line 535 because the condition on line 541 was always true

542 new_tagsets.append(tags)

543 # print("and_tagsets: {} + {} -> {}"

544 # .format(tagsets1, tagsets2, new_tagsets))

545 return new_tagsets

546

547

548@functools.lru_cache(65536)

549def extract_cell_content(

550 lang: str, word: str, col: str

551) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:

552 """Cleans a row/column header for later processing. This returns

553 (cleaned, refs, defs, tags)."""

554 # print("EXTRACT_CELL_CONTENT {!r}".format(col))

555 hdr_tags = []

556 col = re.sub(r"(?s)\s*,\s*$", "", col)

557 col = re.sub(r"(?s)\s*•\s*$", "", col)

558 col = re.sub(r"\s+", " ", col)

559 col = col.strip()

560 if re.search(

561 r"^\s*(There are |"

562 r"\* |"

563 r"see |"

564 r"Use |"

565 r"use the |"

566 r"Only used |"

567 r"The forms in |"

568 r"these are also written |"

569 r"The genitive can be |"

570 r"Genitive forms are rare or non-existant|"

571 r"Accusative Note: |"

572 r"Classifier Note: |"

573 r"Noun: Assamese nouns are |"

574 r"the active conjugation|"

575 r"the instrumenal singular|"

576 r"Note:|"

577 r"\^* Note:|"

578 r"possible mutated form |"

579 r"The future tense: )",

580 col,

581 ):

582 return "dummy-ignored-text-cell", [], [], []

583

584 # Temporarily remove final parenthesized part (if separated by whitespace),

585 # so that we can extract reference markers before it.

586 final_paren = ""

587 m = re.search(r"\s+$[^)]*$$", col)

588 if m is not None:

589 final_paren = m.group(0)

590 col = col[: m.start()]

591

592 # Extract references and tag markers

593 refs = []

594 special_references = get_lang_conf(lang, "special_references")

595 while True:

596 m = re.search(r"\^(.|$[^)]*$)$", col)

597 if not m:

598 break

599 r = m.group(1)

600 if r.startswith("(") and r.endswith(")"):

601 r = r[1:-1]

602 for r1 in r.split(","):

603 if r1 == "rare": 603 ↛ 604line 603 didn't jump to line 604 because the condition on line 603 was never true

604 hdr_tags.append("rare")

605 elif special_references and r1 in special_references:

606 hdr_tags.extend(special_references[r1].split())

607 else:

608 # v = m.group(1)

609 if r1.startswith("(") and r1.endswith(")"): 609 ↛ 610line 609 didn't jump to line 610 because the condition on line 609 was never true

610 r1 = r1[1:-1]

611 refs.append(unicodedata.normalize("NFKD", r1))

612 col = col[: m.start()]

613 # See if it is a ref definition

614 # print("BEFORE REF CHECK: {!r}".format(col))

615 m = def_re.match(col)

616 # print(f"Before def_re: {refs=}")

617 if m and not nondef_re.match(col):

618 ofs = 0

619 ref = None

620 deflst = []

621 for m in re.finditer(def_re, col):

622 if ref:

623 deflst.append((ref, col[ofs : m.start()].strip()))

624 ref = unicodedata.normalize(

625 "NFKD", m.group(3) or m.group(5) or m.group(6) or ""

626 )

627 ofs = m.end()

628 if ref: 628 ↛ 631line 628 didn't jump to line 631 because the condition on line 628 was always true

629 deflst.append((ref, col[ofs:].strip()))

630 # print("deflst:", deflst)

631 return "", [], deflst, []

632 # See if it *looks* like a reference to a definition

633 # print(f"After def_re: {refs=}")

634 while col:

635 if is_superscript(col[-1]) or col[-1] in ("†",):

636 if col.endswith("ʳᵃʳᵉ"):

637 hdr_tags.append("rare")

638 col = col[:-4].strip()

639 continue

640 if special_references:

641 stop_flag = False

642 for r in special_references:

643 if col.endswith(r):

644 hdr_tags.extend(special_references[r].split())

645 col = col[: -len(r)].strip()

646 stop_flag = True

647 break # this for loop

648 if stop_flag:

649 continue # this while loop

650 # Numbers and H/L/N are useful information

651 refs.append(unicodedata.normalize("NFKD", col[-1]))

652 col = col[:-1]

653 else:

654 break

655

656 # Check for another form of note definition

657 if ( 657 ↛ 663line 657 didn't jump to line 663 because the condition on line 657 was never true

658 len(col) > 2

659 and col[1] in (")", " ", ":")

660 and col[0].isdigit()

661 and not re.match(nondef_re, col)

662 ):

663 return "", [], [(col[0], col[2:].strip())], []

664 col = col.strip()

665

666 # Extract final "*" reference symbols. Sometimes there are multiple.

667 m = re.search(r"\*+$", col)

668 if m is not None:

669 col = col[: m.start()]

670 refs.append(unicodedata.normalize("NFKD", m.group(0)))

671 if col.endswith("(*)"): 671 ↛ 672line 671 didn't jump to line 672 because the condition on line 671 was never true

672 col = col[:-3].strip()

673 refs.append("*")

674

675 # Put back the final parenthesized part

676 col = col.strip() + final_paren

677 # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"

678 # .format(orig_col, col, refs, hdr_tags))

679 return col.strip(), refs, [], hdr_tags

680

681

682@functools.lru_cache(10000)

683def parse_title(

684 title: str, source: str

685) -> tuple[list[str], list[str], list[FormData]]:

686 """Parses inflection table title. This returns (global_tags, table_tags,

687 extra_forms), where ``global_tags`` is tags to be added to each inflection

688 entry, ``table_tags`` are tags for the word but not to be added to every

689 form, and ``extra_forms`` is dictionary describing additional forms to be

690 included in the part-of-speech entry)."""

691 assert isinstance(title, str)

692 assert isinstance(source, str)

693 title = html.unescape(title)

694 title = re.sub(r"(?i)<[^>]*>", "", title).strip()

695 title = re.sub(r"\s+", " ", title)

696 # print("PARSE_TITLE:", title)

697 global_tags = []

698 table_tags = []

699 extra_forms = []

700 # Add certain global tags based on contained words

701 for m in re.finditer(title_contains_global_re, title):

702 v = m.group(0).lower()

703 if re.match(table_hdr_ign_part_re, v): 703 ↛ 704line 703 didn't jump to line 704 because the condition on line 703 was never true

704 continue

705 global_tags.extend(title_contains_global_map[v].split())

706 # Add certain tags to table-tags "form" based on contained words

707 for m in re.finditer(title_contains_wordtags_re, title):

708 v = m.group(0).lower()

709 if re.match(table_hdr_ign_part_re, v): 709 ↛ 710line 709 didn't jump to line 710 because the condition on line 709 was never true

710 continue

711 table_tags.extend(title_contains_wordtags_map[v].split())

712 if re.search(r"Conjugation of (s’|se ).*French verbs", title): 712 ↛ 713line 712 didn't jump to line 713 because the condition on line 712 was never true

713 global_tags.append("reflexive")

714 # Check for <x>-type at the beginning of title (e.g., Armenian) and various

715 # other ways of specifying an inflection class.

716 for m in re.finditer(

717 r"\b("

718 r"[\w/]+-type|"

719 r"accent-\w+|"

720 r"[\w/]+-stem|"

721 r"[^ ]+ gradation|"

722 r"\b(stem in [\w/ ]+)|"

723 r"[^ ]+ alternation|"

725 r"(Conjugation|declension)|"

726 r"First and second declension|"

727 r"(1st|2nd|3rd|4th|5th|6th) declension|"

728 r"\w[\w/ ]* harmony"

729 r")\b",

730 title,

731 ):

732 dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}

733 extra_forms.append(dt)

734 # Parse parenthesized part from title

735 for m in re.finditer(r"$([^)]*)$", title):

736 for elem in m.group(1).split(","):

737 # group(0) is the whole string, group(1) first parens

738 elem = elem.strip()

739 if elem in title_elements_map:

740 table_tags.extend(title_elements_map[elem].split())

741 else:

742 m1 = re.match(title_elemstart_re, elem)

743 if m1:

744 tags = title_elemstart_map[m1.group(1)].split()

745 dt = {

746 "form": elem[m1.end() :],

747 "source": source,

748 "tags": tags,

749 }

750 extra_forms.append(dt)

751 # For titles that contains no parenthesized parts, do some special

752 # handling to still interpret parts from them

753 if "(" not in title:

754 # No parenthesized parts

755 m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)

756 if m1 is not None:

757 dt = {"form": m1.group(2), "tags": ["class"], "source": source}

758 extra_forms.append(dt)

759 for elem in title.split(","):

760 elem = elem.strip()

761 if elem in title_elements_map: 761 ↛ 762line 761 didn't jump to line 762 because the condition on line 761 was never true

762 table_tags.extend(title_elements_map[elem].split())

763 elif elem.endswith("-stem"): 763 ↛ 764line 763 didn't jump to line 764 because the condition on line 763 was never true

764 dt = {"form": elem, "tags": ["class"], "source": source}

765 extra_forms.append(dt)

766 return global_tags, table_tags, extra_forms

767

768

769def expand_header(

770 wxr: WiktextractContext,

771 tablecontext: "TableContext",

772 word: str,

773 lang: str,

774 pos: str,

775 text: str,

776 base_tags: Union[list[str], set[str], tuple[str, ...]],

777 silent=False,

778 ignore_tags=False,

779 depth=0,

780) -> list[tuple[str, ...]]:

781 """Expands a cell header to tagset, handling conditional expressions

782 in infl_map. This returns list of tuples of tags, each list element

783 describing an alternative interpretation. ``base_tags`` is combined

784 column and row tags for the cell in which the text is being interpreted

785 (conditional expressions in inflection data may depend on it).

786 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags``

787 is True, then tags listed in "if" will be ignored in the test (this is

788 used when trying to heuristically detect whether a non-<th> cell is anyway

789 a header)."""

790 assert isinstance(wxr, WiktextractContext)

791 assert isinstance(word, str)

792 assert isinstance(lang, str)

793 assert isinstance(pos, str)

794 assert isinstance(text, str)

795 assert isinstance(base_tags, (list, tuple, set))

796 assert silent in (True, False)

797 assert isinstance(depth, int)

798 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags))

799 # First map the text using the inflection map

800 text = clean_value(wxr, text)

801 combined_return: list[tuple[str, ...]] = []

802 parts = split_at_comma_semi(text, separators=[";"])

803 for text in parts:

804 if not text: 804 ↛ 805line 804 didn't jump to line 805 because the condition on line 804 was never true

805 continue

806 if text in infl_map:

807 v = infl_map[text] # list or string

808 else:

809 m = re.match(infl_start_re, text)

810 if m is not None: 810 ↛ 811line 810 didn't jump to line 811 because the condition on line 810 was never true

811 v = infl_start_map[m.group(1)]

812 # print("INFL_START {} -> {}".format(text, v))

813 elif re.match(r"Notes", text):

814 # Ignored header

815 # print("IGNORING NOTES")

816 combined_return = or_tagsets(

817 lang, pos, combined_return, [("dummy-skip-this",)]

818 )

819 # this just adds dummy-skip-this

820 continue

821 elif text in IGNORED_COLVALUES:

822 combined_return = or_tagsets(

823 lang, pos, combined_return, [("dummy-ignore-skipped",)]

824 )

825 continue

826 # Try without final parenthesized part

827 text_without_parens = re.sub(r"[,/]?\s+$[^)]*$\s*$", "", text)

828 if text_without_parens in infl_map:

829 v = infl_map[text_without_parens]

830 elif m is None: 830 ↛ 846line 830 didn't jump to line 846 because the condition on line 830 was always true

831 if not silent:

832 wxr.wtp.debug(

833 "inflection table: unrecognized header: {}".format(

834 repr(text)

835 ),

836 sortid="inflection/735",

837 )

838 # Unrecognized header

839 combined_return = or_tagsets(

840 lang, pos, combined_return, [("error-unrecognized-form",)]

841 )

842 continue

843

844 # Then loop interpreting the value, until the value is a simple string.

845 # This may evaluate nested conditional expressions.

846 default_then = None

847 while True:

848 # If it is a string, we are done.

849 if isinstance(v, str):

850 tags = set(v.split())

851 remove_useless_tags(lang, pos, tags)

852 tagset = [tuple(sorted(tags))]

853 break

854 # For a list, just interpret it as alternatives. (Currently the

855 # alternatives must directly be strings.)

856 if isinstance(v, (list, tuple)):

857 tagset = []

858 for x in v:

859 tags = set(x.split())

860 remove_useless_tags(lang, pos, tags)

861 tags_t = tuple(sorted(tags))

862 if tags_t not in tagset: 862 ↛ 858line 862 didn't jump to line 858 because the condition on line 862 was always true

863 tagset.append(tags_t)

864 break

865 # Otherwise the value should be a dictionary describing a

866 # conditional expression.

867 if not isinstance(v, dict): 867 ↛ 868line 867 didn't jump to line 868 because the condition on line 867 was never true

868 wxr.wtp.debug(

869 "inflection table: internal: "

870 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]),

871 sortid="inflection/767",

872 )

873 tagset = [()]

874 break

875 # Evaluate the conditional expression.

876 assert isinstance(v, dict)

877 cond: Union[bool, str] = "default-true"

878 c: Union[str, list[str], set[str]] = ""

879 # Handle "lang" condition. The value must be either a

880 # single language or a list of languages, and the

881 # condition evaluates to True if the table is one of

882 # those languages.

883 if "lang" in v:

884 c = v["lang"]

885 if isinstance(c, str):

886 cond = c == lang

887 else:

888 assert isinstance(c, (list, tuple, set))

889 cond = lang in c

890 # Handle "nested-table-depth" condition. The value must

891 # be an int or list of ints, and the condition evaluates

892 # True if the depth is one of those values.

893 # "depth" is how deep into a nested table tree the current

894 # table lies. It is first started in handle_wikitext_table,

895 # so only applies to tables-within-tables, not other

896 # WikiNode content. `depth` is currently only passed as a

897 # parameter down the table parsing stack, and not stored.

898 if cond and "nested-table-depth" in v: 898 ↛ 899line 898 didn't jump to line 899 because the condition on line 898 was never true

899 d = v["nested-table-depth"]

900 if isinstance(d, int):

901 cond = d == depth

902 else:

903 assert isinstance(d, (list, tuple, set))

904 cond = depth in d

905 # Handle inflection-template condition. Must be a string

906 # or list of strings, and if tablecontext.template_name is in

907 # those, accept the condition.

908 # TableContext.template_name is passed down from page/

909 # parse_inflection, before parsing and expanding itself

910 # has begun.

911 if cond and tablecontext and "inflection-template" in v:

912 d1 = v["inflection-template"]

913 if isinstance(d1, str): 913 ↛ 916line 913 didn't jump to line 916 because the condition on line 913 was always true

914 cond = d1 == tablecontext.template_name

915 else:

916 assert isinstance(d1, (list, tuple, set))

917 cond = tablecontext.template_name in d1

918 # Handle "pos" condition. The value must be either a single

919 # part-of-speech or a list of them, and the condition evaluates to

920 # True if the part-of-speech is any of those listed.

921 if cond and "pos" in v:

922 c = v["pos"]

923 if isinstance(c, str):

924 cond = c == pos

925 else:

926 assert isinstance(c, (list, tuple, set))

927 cond = pos in c

928 # Handle "if" condition. The value must be a string containing a

929 # space-separated list of tags. The condition evaluates to True if

930 # ``base_tags`` contains all of the listed tags. If the condition

931 # is of the form "any: ...tags...", then any of the tags will be

932 # enough.

933 if cond and "if" in v and not ignore_tags:

934 c = v["if"]

935 assert isinstance(c, str)

936 # "if" condition is true if any of the listed tags is present if

937 # it starts with "any:", otherwise all must be present

938 if c.startswith("any: "):

939 cond = any(t in base_tags for t in c[5:].split())

940 else:

941 cond = all(t in base_tags for t in c.split())

942

943 # Handle "default" assignment. Store the value to be used

944 # as a default later.

945 if "default" in v:

946 assert isinstance(v["default"], str)

947 default_then = v["default"]

948

949 # Warning message about missing conditions for debugging.

950

951 if cond == "default-true" and not default_then and not silent:

952 wxr.wtp.debug(

953 "inflection table: IF MISSING COND: word={} "

954 "lang={} text={} base_tags={} c={} cond={}".format(

955 word, lang, text, base_tags, c, cond

956 ),

957 sortid="inflection/851",

958 )

959 # Based on the result of evaluating the condition, select either

960 # "then" part or "else" part.

961 if cond:

962 v = v.get("then", "")

963 else:

964 v1 = v.get("else")

965 if v1 is None:

966 if default_then:

967 v = default_then

968 else:

969 if not silent:

970 wxr.wtp.debug(

971 "inflection table: IF WITHOUT ELSE EVALS "

972 "False: "

973 "{}/{} {!r} base_tags={}".format(

974 word, lang, text, base_tags

975 ),

976 sortid="inflection/865",

977 )

978 v = "error-unrecognized-form"

979 else:

980 v = v1

981

982 # Merge the resulting tagset from this header part with the other

983 # tagsets from the whole header

984 combined_return = or_tagsets(lang, pos, combined_return, tagset)

985

986 # Return the combined tagsets, or empty tagset if we got no tagsets

987 if not combined_return:

988 combined_return = [()]

989 return combined_return

990

991

992def compute_coltags(

993 lang: str,

994 pos: str,

995 hdrspans: list[str],

996 start: int,

997 colspan: int,

998 celltext: int,

999) -> list[tuple[str]]:

1000 """Computes column tags for a column of the given width based on the

1001 current header spans."""

1002 assert isinstance(lang, str)

1003 assert isinstance(pos, str)

1004 assert isinstance(hdrspans, list)

1005 assert isinstance(start, int) and start >= 0

1006 assert isinstance(colspan, int) and colspan >= 1

1007 assert isinstance(celltext, str) # For debugging only

1008 # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"

1009 # .format(start, colspan, celltext))

1010 # For debugging, set this to the form for whose cell you want debug prints

1011 if celltext == debug_cell_text: 1011 ↛ 1012line 1011 didn't jump to line 1012 because the condition on line 1011 was never true

1012 print(

1013 "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(

1014 start, colspan, celltext

1015 )

1016 )

1017 for hdrspan in hdrspans:

1018 print(

1019 " row={} start={} colspans={} tagsets={}".format(

1020 hdrspan.rownum,

1021 hdrspan.start,

1022 hdrspan.colspan,

1023 hdrspan.tagsets,

1024 )

1025 )

1026 used = set()

1027 coltags = [()]

1028 last_header_row = 1000000

1029 # Iterate through the headers in reverse order, i.e., headers lower in the

1030 # table (closer to the cell) first.

1031 row_tagsets = [()]

1032 row_tagsets_rownum = 1000000

1033 used_hdrspans = set()

1034 for hdrspan in reversed(hdrspans):

1035 if (

1036 hdrspan.start + hdrspan.colspan <= start

1037 or hdrspan.start >= start + colspan

1038 ):

1039 # Does not horizontally overlap current cell. Ignore this hdrspan.

1040 if celltext == debug_cell_text: 1040 ↛ 1041line 1040 didn't jump to line 1041 because the condition on line 1040 was never true

1041 print(

1042 "Ignoring row={} start={} colspan={} tagsets={}".format(

1043 hdrspan.rownum,

1044 hdrspan.start,

1045 hdrspan.colspan,

1046 hdrspan.tagsets,

1047 )

1048 )

1049 continue

1050 # If the cell partially overlaps the current cell, assume we have

1051 # reached something unrelated and abort.

1052 if (

1053 hdrspan.start < start

1054 and hdrspan.start + hdrspan.colspan > start

1055 and hdrspan.start + hdrspan.colspan < start + colspan

1056 ):

1057 if celltext == debug_cell_text: 1057 ↛ 1058line 1057 didn't jump to line 1058 because the condition on line 1057 was never true

1058 print(

1059 "break on partial overlap at start {} {} {}".format(

1060 hdrspan.start, hdrspan.colspan, hdrspan.tagsets

1061 )

1062 )

1063 break

1064 if (

1065 hdrspan.start < start + colspan

1066 and hdrspan.start > start

1067 and hdrspan.start + hdrspan.colspan > start + colspan

1068 and not hdrspan.expanded

1069 ):

1070 if celltext == debug_cell_text: 1070 ↛ 1071line 1070 didn't jump to line 1071 because the condition on line 1070 was never true

1071 print(

1072 "break on partial overlap at end {} {} {}".format(

1073 hdrspan.start, hdrspan.colspan, hdrspan.tagsets

1074 )

1075 )

1076 break

1077 # Check if we have already used this cell.

1078 if id(hdrspan) in used_hdrspans:

1079 continue

1080 # We are going to use this cell.

1081 used_hdrspans.add(id(hdrspan))

1082 tagsets = hdrspan.tagsets

1083 # If the hdrspan is fully inside the current cell and does not cover

1084 # it fully, check if we should merge information from multiple cells.

1085 if not hdrspan.expanded and (

1086 hdrspan.start > start

1087 or hdrspan.start + hdrspan.colspan < start + colspan

1088 ):

1089 # Multiple columns apply to the current cell, only

1090 # gender/number/case tags present

1091 # If there are no tags outside the range in any of the

1092 # categories included in these cells, don't add anything

1093 # (assume all choices valid in the language are possible).

1094 in_cats = set(

1095 valid_tags[t]

1096 for x in hdrspans

1097 if x.rownum == hdrspan.rownum

1098 and x.start >= start

1099 and x.start + x.colspan <= start + colspan

1100 for tt in x.tagsets

1101 for t in tt

1102 )

1103 if celltext == debug_cell_text: 1103 ↛ 1104line 1103 didn't jump to line 1104 because the condition on line 1103 was never true

1104 print("in_cats={} tagsets={}".format(in_cats, tagsets))

1105 # Merge the tagsets into existing tagsets. This merges

1106 # alternatives into the same tagset if there is only one

1107 # category different; otherwise this splits the tagset into

1108 # more alternatives.

1109 includes_all_on_row = True

1110 for x in hdrspans:

1111 # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))

1112 if x.rownum != hdrspan.rownum:

1113 continue

1114 if x.start < start or x.start + x.colspan > start + colspan:

1115 if celltext == debug_cell_text: 1115 ↛ 1116line 1115 didn't jump to line 1116 because the condition on line 1115 was never true

1116 print(

1117 "NOT IN RANGE: {} {} {}".format(

1118 x.start, x.colspan, x.tagsets

1119 )

1120 )

1121 includes_all_on_row = False

1122 continue

1123 if id(x) in used_hdrspans:

1124 if celltext == debug_cell_text: 1124 ↛ 1125line 1124 didn't jump to line 1125 because the condition on line 1124 was never true

1125 print(

1126 "ALREADY USED: {} {} {}".format(

1127 x.start, x.colspan, x.tagsets

1128 )

1129 )

1130 continue

1131 used_hdrspans.add(id(x))

1132 if celltext == debug_cell_text: 1132 ↛ 1133line 1132 didn't jump to line 1133 because the condition on line 1132 was never true

1133 print(

1134 "Merging into wide col: x.rownum={} "

1135 "x.start={} x.colspan={} "

1136 "start={} colspan={} tagsets={} x.tagsets={}".format(

1137 x.rownum,

1138 x.start,

1139 x.colspan,

1140 start,

1141 colspan,

1142 tagsets,

1143 x.tagsets,

1144 )

1145 )

1146 tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)

1147 # If all headers on the row were included, ignore them.

1148 # See e.g. kunna/Swedish/Verb.

1149 ts_cats = tagset_cats(tagsets)

1150 if (

1151 includes_all_on_row

1152 or

1153 # Kludge, see fut/Hungarian/Verb

1154 ("tense" in ts_cats and "object" in ts_cats)

1155 ):

1156 tagsets = [()]

1157 # For limited categories, if the category doesn't appear

1158 # outside, we won't include the category

1159 if not in_cats - set(

1160 ("gender", "number", "person", "case", "category", "voice")

1161 ):

1162 # Sometimes we have masc, fem, neut and plural, so treat

1163 # number and gender as the same here (if one given, look for

1164 # the other too)

1165 if "number" in in_cats or "gender" in in_cats:

1166 in_cats.update(("number", "gender"))

1167 # Determine which categories occur outside on

1168 # the same row. Ignore headers that have been expanded

1169 # to cover the whole row/part of it.

1170 out_cats = set(

1171 valid_tags[t]

1172 for x in hdrspans

1173 if x.rownum == hdrspan.rownum

1174 and not x.expanded

1175 and (

1176 x.start < start or x.start + x.colspan > start + colspan

1177 )

1178 for tt in x.tagsets

1179 for t in tt

1180 )

1181 if celltext == debug_cell_text: 1181 ↛ 1182line 1181 didn't jump to line 1182 because the condition on line 1181 was never true

1182 print("in_cats={} out_cats={}".format(in_cats, out_cats))

1183 # Remove all inside categories that do not appear outside

1184

1185 new_tagsets = []

1186 for ts in tagsets:

1187 tags = tuple(

1188 sorted(t for t in ts if valid_tags[t] in out_cats)

1189 )

1190 if tags not in new_tagsets: 1190 ↛ 1186line 1190 didn't jump to line 1186 because the condition on line 1190 was always true

1191 new_tagsets.append(tags)

1192 if celltext == debug_cell_text and new_tagsets != tagsets: 1192 ↛ 1193line 1192 didn't jump to line 1193 because the condition on line 1192 was never true

1193 print(

1194 "Removed tags that do not "

1195 "appear outside {} -> {}".format(

1196 # have_hdr never used?

1197 tagsets,

1198 new_tagsets,

1199 )

1200 )

1201 tagsets = new_tagsets

1202 key = (hdrspan.start, hdrspan.colspan)

1203 if key in used:

1204 if celltext == debug_cell_text: 1204 ↛ 1205line 1204 didn't jump to line 1205 because the condition on line 1204 was never true

1205 print(

1206 "Cellspan already used: start={} "

1207 "colspan={} rownum={} {}".format(

1208 hdrspan.start,

1209 hdrspan.colspan,

1210 hdrspan.rownum,

1211 hdrspan.tagsets,

1212 )

1213 )

1214 action = get_lang_conf(lang, "reuse_cellspan")

1215 # can be "stop", "skip" or "reuse"

1216 if action == "stop":

1217 break

1218 if action == "skip":

1219 continue

1220 assert action == "reuse"

1221 tcats = tagset_cats(tagsets)

1222 # Most headers block using the same column position above. However,

1223 # "register" tags don't do this (cf. essere/Italian/verb: "formal")

1224 if len(tcats) != 1 or "register" not in tcats:

1225 used.add(key)

1226 # If we have moved to a different row, merge into column tagsets

1227 # (we use different and_tagsets within the row)

1228 if row_tagsets_rownum != hdrspan.rownum:

1229 # row_tagsets_rownum was initialized as 10000000

1230 ret = and_tagsets(lang, pos, coltags, row_tagsets)

1231 if celltext == debug_cell_text: 1231 ↛ 1232line 1231 didn't jump to line 1232 because the condition on line 1231 was never true

1232 print(

1233 "merging rows: {} {} -> {}".format(

1234 coltags, row_tagsets, ret

1235 )

1236 )

1237 coltags = ret

1238 row_tagsets = [()]

1239 row_tagsets_rownum = hdrspan.rownum

1240 # Merge into coltags

1241 if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:

1242 # If this row is all headers and immediately preceeds the last

1243 # header we accepted, take any header from there.

1244 row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)

1245 if celltext == debug_cell_text: 1245 ↛ 1246line 1245 didn't jump to line 1246 because the condition on line 1245 was never true

1246 print("merged (next header row): {}".format(row_tagsets))

1247 else:

1248 # new_cats is for the new tags (higher up in the table)

1249 new_cats = tagset_cats(tagsets)

1250 # cur_cats is for the tags already collected (lower in the table)

1251 cur_cats = tagset_cats(coltags)

1252 if celltext == debug_cell_text: 1252 ↛ 1253line 1252 didn't jump to line 1253 because the condition on line 1252 was never true

1253 print(

1254 "row={} start={} colspan={} tagsets={} coltags={} "

1255 "new_cats={} cur_cats={}".format(

1256 hdrspan.rownum,

1257 hdrspan.start,

1258 hdrspan.colspan,

1259 tagsets,

1260 coltags,

1261 new_cats,

1262 cur_cats,

1263 )

1264 )

1265 if "detail" in new_cats:

1266 if not any(coltags): # Only if no tags so far

1267 coltags = or_tagsets(lang, pos, coltags, tagsets)

1268 if celltext == debug_cell_text: 1268 ↛ 1269line 1268 didn't jump to line 1269 because the condition on line 1268 was never true

1269 print("stopping on detail after merge")

1270 break

1271 # Here, we block bleeding of categories from above

1272 elif "non-finite" in cur_cats and "non-finite" in new_cats:

1273 stop = get_lang_conf(lang, "stop_non_finite_non_finite")

1274 if stop: 1274 ↛ 1300line 1274 didn't jump to line 1300 because the condition on line 1274 was always true

1275 if celltext == debug_cell_text: 1275 ↛ 1276line 1275 didn't jump to line 1276 because the condition on line 1275 was never true

1276 print("stopping on non-finite-non-finite")

1277 break

1278 elif "non-finite" in cur_cats and "voice" in new_cats:

1279 stop = get_lang_conf(lang, "stop_non_finite_voice")

1280 if stop: 1280 ↛ 1300line 1280 didn't jump to line 1300 because the condition on line 1280 was always true

1281 if celltext == debug_cell_text: 1281 ↛ 1282line 1281 didn't jump to line 1282 because the condition on line 1281 was never true

1282 print("stopping on non-finite-voice")

1283 break

1284 elif "non-finite" in new_cats and cur_cats & set(

1285 ("person", "number")

1286 ):

1287 if celltext == debug_cell_text: 1287 ↛ 1288line 1287 didn't jump to line 1288 because the condition on line 1287 was never true

1288 print("stopping on non-finite new")

1289 break

1290 elif "non-finite" in new_cats and "tense" in new_cats:

1291 stop = get_lang_conf(lang, "stop_non_finite_tense")

1292 if stop:

1293 if celltext == debug_cell_text: 1293 ↛ 1294line 1293 didn't jump to line 1294 because the condition on line 1293 was never true

1294 print("stopping on non-finite new")

1295 break

1296 elif "non-finite" in cur_cats and new_cats & set(("mood",)): 1296 ↛ 1297line 1296 didn't jump to line 1297 because the condition on line 1296 was never true

1297 if celltext == debug_cell_text:

1298 print("stopping on non-finite cur")

1299 break

1300 if (

1301 "tense" in new_cats

1302 and any("imperative" in x for x in coltags)

1303 and get_lang_conf(lang, "imperative_no_tense")

1304 ):

1305 if celltext == debug_cell_text: 1305 ↛ 1306line 1305 didn't jump to line 1306 because the condition on line 1305 was never true

1306 print("skipping tense in imperative")

1307 continue

1308 elif (

1309 "mood" in new_cats

1310 and "mood" in cur_cats

1311 and

1312 # Allow if all new tags are already in current set

1313 any(

1314 t not in ts1

1315 for ts1 in coltags # current

1316 for ts2 in tagsets # new (from above)

1317 for t in ts2

1318 )

1319 ):

1320 skip = get_lang_conf(lang, "skip_mood_mood")

1321 if skip:

1322 if celltext == debug_cell_text: 1322 ↛ 1323line 1322 didn't jump to line 1323 because the condition on line 1322 was never true

1323 print("skipping on mood-mood")

1324 # we continue to next header

1325 else:

1326 if celltext == debug_cell_text: 1326 ↛ 1327line 1326 didn't jump to line 1327 because the condition on line 1326 was never true

1327 print("stopping on mood-mood")

1328 break

1329 elif "tense" in new_cats and "tense" in cur_cats:

1330 skip = get_lang_conf(lang, "skip_tense_tense")

1331 if skip:

1332 if celltext == debug_cell_text: 1332 ↛ 1333line 1332 didn't jump to line 1333 because the condition on line 1332 was never true

1333 print("skipping on tense-tense")

1334 # we continue to next header

1335 else:

1336 if celltext == debug_cell_text: 1336 ↛ 1337line 1336 didn't jump to line 1337 because the condition on line 1336 was never true

1337 print("stopping on tense-tense")

1338 break

1339 elif "aspect" in new_cats and "aspect" in cur_cats:

1340 if celltext == debug_cell_text: 1340 ↛ 1341line 1340 didn't jump to line 1341 because the condition on line 1340 was never true

1341 print("skipping on aspect-aspect")

1342 continue

1343 elif "number" in cur_cats and "number" in new_cats:

1344 if celltext == debug_cell_text: 1344 ↛ 1345line 1344 didn't jump to line 1345 because the condition on line 1344 was never true

1345 print("stopping on number-number")

1346 break

1347 elif "number" in cur_cats and "gender" in new_cats:

1348 if celltext == debug_cell_text: 1348 ↛ 1349line 1348 didn't jump to line 1349 because the condition on line 1348 was never true

1349 print("stopping on number-gender")

1350 break

1351 elif "person" in cur_cats and "person" in new_cats:

1352 if celltext == debug_cell_text: 1352 ↛ 1353line 1352 didn't jump to line 1353 because the condition on line 1352 was never true

1353 print("stopping on person-person")

1354 break

1355 else:

1356 # Merge tags and continue to next header up/left in the table.

1357 row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)

1358 if celltext == debug_cell_text: 1358 ↛ 1359line 1358 didn't jump to line 1359 because the condition on line 1358 was never true

1359 print("merged: {}".format(coltags))

1360 # Update the row number from which we have last taken headers

1361 last_header_row = hdrspan.rownum

1362 # Merge the final row tagset into coltags

1363 coltags = and_tagsets(lang, pos, coltags, row_tagsets)

1364 # print(

1365 # "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)

1366 # )

1367 if celltext == debug_cell_text: 1367 ↛ 1368line 1367 didn't jump to line 1368 because the condition on line 1367 was never true

1368 print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))

1369 assert isinstance(coltags, list)

1370 assert all(isinstance(x, tuple) for x in coltags)

1371 return coltags

1372

1373

1374def parse_simple_table(

1375 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth

1376):

1377 """This is the default table parser. Despite its name, it can parse

1378 complex tables. This returns a list of forms to be added to the

1379 part-of-speech, or None if the table could not be parsed."""

1380 assert isinstance(wxr, WiktextractContext)

1381 assert isinstance(tablecontext, TableContext)

1382 assert isinstance(word, str)

1383 assert isinstance(lang, str)

1384 assert isinstance(pos, str)

1385 assert isinstance(rows, list)

1386 assert isinstance(source, str)

1387 assert isinstance(after, str)

1388 assert isinstance(depth, int)

1389 for row in rows:

1390 for col in row:

1391 assert isinstance(col, InflCell)

1392 assert isinstance(titles, list)

1393 for x in titles:

1394 assert isinstance(x, str)

1395

1396 # print("PARSE_SIMPLE_TABLE: TITLES:", titles)

1397 if debug_cell_text: 1397 ↛ 1398line 1397 didn't jump to line 1398 because the condition on line 1397 was never true

1398 print("ROWS:")

1399 for row in rows:

1400 print(" ", row)

1401

1402 # Check for forced rowspan kludge. See e.g.

1403 # maorski/Serbo-Croatian. These are essentially multi-row

1404 # cells implemented using <br> rather than separate cell. We fix this

1405 # by identifying rows where this happens, and splitting the current row

1406 # to multiple rows by synthesizing additional cells.

1407 new_rows = []

1408 for row in rows:

1409 split_row = (

1410 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row)

1411 and

1412 # x is an InflCell

1413 all(x.rowspan == 1 for x in row)

1414 )

1415 if not split_row:

1416 new_rows.append(row)

1417 continue

1418 row1 = []

1419 row2 = []

1420 for cell in row:

1421 cell1 = copy.deepcopy(cell)

1422 if "\n" in cell.text:

1423 # Has more than one line - split this cell

1424 parts = cell.text.strip().splitlines()

1425 if len(parts) != 2: 1425 ↛ 1426line 1425 didn't jump to line 1426 because the condition on line 1425 was never true

1426 wxr.wtp.debug(

1427 "forced rowspan kludge got {} parts: {!r}".format(

1428 len(parts), cell.text

1429 ),

1430 sortid="inflection/1234",

1431 )

1432 cell2 = copy.deepcopy(cell)

1433 cell1.text = parts[0]

1434 cell2.text = parts[1]

1435 else:

1436 cell1.rowspan = 2

1437 cell2 = cell1 # ref, not a copy

1438 row1.append(cell1)

1439 row2.append(cell2)

1440 new_rows.append(row1)

1441 new_rows.append(row2)

1442 rows = new_rows

1443 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:")

1444 # for row in rows:

1445 # print(" ", row)

1446

1447 # Parse definitions for references (from table itself and from text

1448 # after it)

1449 def_ht = {}

1450

1451 def add_defs(defs: list[tuple[str, str]]) -> None:

1452 for ref, d in defs:

1453 # print("DEF: ref={} d={}".format(ref, d))

1454 d = d.strip()

1455 d = d.split(". ")[0].strip() # text before ". "

1456 if not d: 1456 ↛ 1457line 1456 didn't jump to line 1457 because the condition on line 1456 was never true

1457 continue

1458 if d.endswith("."): # catc ".."??

1459 d = d[:-1]

1460 tags, topics = decode_tags(d, no_unknown_starts=True)

1461 # print(f"{ref=}, {d=}, {tags=}")

1462 if topics or any("error-unknown-tag" in ts for ts in tags):

1463 d = d[0].lower() + d[1:]

1464 tags, topics = decode_tags(d, no_unknown_starts=True)

1465 if topics or any("error-unknown-tag" in ts for ts in tags):

1466 # Failed to parse as tags

1467 # print("Failed: topics={} tags={}"

1468 # .format(topics, tags))

1469 continue

1470 tags1_s: set[str] = set()

1471 for ts in tags:

1472 tags1_s.update(ts)

1473 tags1 = tuple(sorted(tags1_s))

1474 # print("DEFINED: {} -> {}".format(ref, tags1))

1475 def_ht[ref] = tags1

1476

1477 def generate_tags(

1478 rowtags: list[tuple[str]], table_tags: list[str]

1479 ) -> tuple[

1480 list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]

1481 ]:

1482 new_coltags = []

1483 all_hdr_tags = [] # list of tuples

1484 new_rowtags = []

1485 for rt0 in rowtags:

1486 for ct0 in compute_coltags(

1487 lang,

1488 pos,

1489 hdrspans,

1490 col_idx, # col_idx=>start

1491 colspan,

1492 col, # cell_text

1493 ):

1494 base_tags: set[str] = (

1495 set(rt0)

1496 | set(ct0)

1497 | set(global_tags)

1498 | set(itertools.chain.from_iterable(table_tags))

1499 ) # Union.

1500 alt_tags = expand_header(

1501 wxr,

1502 tablecontext,

1503 word,

1504 lang,

1505 pos,

1506 text,

1507 base_tags,

1508 depth=depth,

1509 )

1510 # base_tags are used in infl_map "if"-conds.

1511 for tt in alt_tags:

1512 if tt not in all_hdr_tags:

1513 all_hdr_tags.append(tt)

1514 tt_s = set(tt)

1515 # Certain tags are always moved to word-level tags

1516 if tt_s & TAGS_FORCED_WORDTAGS: 1516 ↛ 1517line 1516 didn't jump to line 1517 because the condition on line 1516 was never true

1517 table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS)

1518 tt_s = tt_s - TAGS_FORCED_WORDTAGS

1519 # Add tags from referenced footnotes

1520 tt_s.update(refs_tags)

1521 # Sort, convert to tuple, and add to set of

1522 # alternatives.

1523 tt = tuple(sorted(tt_s))

1524 if tt not in new_coltags:

1525 new_coltags.append(tt)

1526 # Kludge (saprast/Latvian/Verb): ignore row tags

1527 # if trying to add a non-finite after mood.

1528 if any(valid_tags[t] == "mood" for t in rt0) and any(

1529 valid_tags[t] == "non-finite" for t in tt

1530 ):

1531 tags = tuple(sorted(set(tt) | set(hdr_tags)))

1532 else:

1533 tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))

1534 if tags not in new_rowtags:

1535 new_rowtags.append(tags)

1536 return new_rowtags, new_coltags, all_hdr_tags

1537

1538 def add_new_hdrspan(

1539 col: str,

1540 hdrspans: list[HdrSpan],

1541 store_new_hdrspan: bool,

1542 col0_followed_by_nonempty: bool,

1543 col0_hdrspan: Optional[HdrSpan],

1544 ) -> tuple[str, bool, Optional[HdrSpan]]:

1545 hdrspan = HdrSpan(

1546 col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers

1547 )

1548 hdrspans.append(hdrspan)

1549

1550 # infl-map tag "dummy-store-hdrspan" causes this new hdrspan

1551 # to be added to a register of stored hdrspans to be used

1552 # later with "dummy-load-stored-hdrspans".

1553 if store_new_hdrspan: 1553 ↛ 1554line 1553 didn't jump to line 1554 because the condition on line 1553 was never true

1554 tablecontext.stored_hdrspans.append(hdrspan)

1555

1556 # Handle headers that are above left-side header

1557 # columns and are followed by personal pronouns in

1558 # remaining columns (basically headers that

1559 # evaluate to no tags). In such cases widen the

1560 # left-side header to the full row.

1561 if previously_seen: # id(cell) in seen_cells previously

1562 col0_followed_by_nonempty = True

1563 return col, col0_followed_by_nonempty, col0_hdrspan

1564 elif col0_hdrspan is None:

1565 col0_hdrspan = hdrspan

1566 elif any(all_hdr_tags): 1566 ↛ 1634line 1566 didn't jump to line 1634 because the condition on line 1566 was always true

1567 col0_cats = tagset_cats(col0_hdrspan.tagsets)

1568 later_cats = tagset_cats(all_hdr_tags)

1569 col0_allowed = get_lang_conf(lang, "hdr_expand_first")

1570 later_allowed = get_lang_conf(lang, "hdr_expand_cont")

1571 later_allowed = later_allowed | set(["dummy"])

1572 # dummy2 has different behavior than plain dummy

1573 # and does not belong here.

1574

1575 # print("col0_cats={} later_cats={} "

1576 # "fol_by_nonempty={} col_idx={} end={} "

1577 # "tagsets={}"

1578 # .format(col0_cats, later_cats,

1579 # col0_followed_by_nonempty, col_idx,

1580 # col0_hdrspan.start +

1581 # col0_hdrspan.colspan,

1582 # col0_hdrspan.tagsets))

1583 # print("col0.rowspan={} rowspan={}"

1584 # .format(col0_hdrspan.rowspan, rowspan))

1585 # Only expand if [col0_cats and later_cats are allowed

1586 # and don't overlap] and [col0 has tags], and there have

1587 # been [no disallowed cells in between].

1588 #

1589 # There are three cases here:

1590 # - col0_hdrspan set, continue with allowed current

1591 # - col0_hdrspan set, expand, start new

1592 # - col0_hdrspan set, no expand, start new

1593 if (

1594 not col0_followed_by_nonempty

1595 and

1596 # XXX Only one cat of tags: kunna/Swedish

1597 # XXX len(col0_cats) == 1 and

1598 col0_hdrspan.rowspan >= rowspan

1599 and

1600 # from hdrspan

1601 not (later_cats - later_allowed)

1602 and not (col0_cats & later_cats)

1603 ):

1604 # First case: col0 set, continue

1605 return col, col0_followed_by_nonempty, col0_hdrspan

1606 # We are going to start new col0_hdrspan. Check if

1607 # we should expand.

1608 if (

1609 not col0_followed_by_nonempty

1610 and not (col0_cats - col0_allowed)

1611 and

1612 # Only "allowed" allowed

1613 # XXX len(col0_cats) == 1 and

1614 col_idx > col0_hdrspan.start + col0_hdrspan.colspan

1615 ):

1616 # col_idx is beyond current colspan

1617 # *Expand* current col0_hdrspan

1618 # print("EXPANDING COL0 MID: {} from {} to {} "

1619 # "cols {}"

1620 # .format(col0_hdrspan.text,

1621 # col0_hdrspan.colspan,

1622 # col_idx - col0_hdrspan.start,

1623 # col0_hdrspan.tagsets))

1624 col0_hdrspan.colspan = col_idx - col0_hdrspan.start

1625 col0_hdrspan.expanded = True

1626 # Clear old col0_hdrspan

1627 if col == debug_cell_text: 1627 ↛ 1628line 1627 didn't jump to line 1628 because the condition on line 1627 was never true

1628 print("START NEW {}".format(hdrspan.tagsets))

1629 col0_hdrspan = None

1630 # Now start new, unless it comes from previous row

1631 if not previously_seen: 1631 ↛ 1634line 1631 didn't jump to line 1634 because the condition on line 1631 was always true

1632 col0_hdrspan = hdrspan

1633 col0_followed_by_nonempty = False

1634 return col, col0_followed_by_nonempty, col0_hdrspan

1635

1636 def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:

1637 # Split the cell text into alternatives

1638 split_extra_tags = []

1639 if col and is_superscript(col[0]): 1639 ↛ 1640line 1639 didn't jump to line 1640 because the condition on line 1639 was never true

1640 alts = [col]

1641 else:

1642 separators = [";", "•", r"\n", " or "]

1643 if " + " not in col:

1644 separators.append(",")

1645 if not col.endswith("/"):

1646 separators.append("/")

1647 if col in special_phrase_splits:

1648 # Use language-specific special splits.

1649 # These are phrases and constructions that have

1650 # unique ways of splitting, not specific characters

1651 # to split on like with the default splitting.

1652 alts, tags = special_phrase_splits[col]

1653 split_extra_tags = tags.split()

1654 for x in split_extra_tags:

1655 assert x in valid_tags

1656 assert isinstance(alts, (list, tuple))

1657 assert isinstance(tags, str)

1658 else:

1659 # Use default splitting. However, recognize

1660 # language-specific replacements and change them to magic

1661 # characters before splitting. This way we won't split

1662 # them. This is important for, e.g., recognizing

1663 # alternative pronouns.

1664 # The magic characters are characters out of Unicode scope

1665 # that are given a simple incremental value, int > unicode.

1666 repls = {}

1667 magic_ch = MAGIC_FIRST

1668 trs = get_lang_conf(lang, "form_transformations")

1669 # trs is a list of lists of strings

1670 for _, v, _, _ in trs:

1671 # v is a pattern string, like "^ich"

1672 # form_transformations data is doing double-duty here,

1673 # because the pattern strings are already known to us and

1674 # not meant to be split.

1675 m = re.search(v, col)

1676 if m is not None:

1677 # if pattern found in text

1678 magic = chr(magic_ch)

1679 magic_ch += 1 # next magic character value

1680 col = re.sub(v, magic, col) # replace with magic ch

1681 repls[magic] = m.group(0)

1682 # remember what regex match string each magic char

1683 # replaces. .group(0) is the whole match.

1684 alts0 = split_at_comma_semi(col, separators=separators)

1685 # with magic characters in place, split the text so that

1686 # pre-transformation text is out of the way.

1687 alts = []

1688 for alt in alts0:

1689 # create a new list with the separated items and

1690 # the magic characters replaced with the original texts.

1691 for k, v in repls.items():

1692 alt = re.sub(k, v, alt)

1693 alts.append(alt)

1694 # Remove "*" from beginning of forms, as in non-attested

1695 # or reconstructed forms. Otherwise it might confuse romanization

1696 # detection.

1697 alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts)

1698 alts = list(

1699 x for x in alts if not re.match(r"pronounced with |\(with ", x)

1700 )

1701 alts = list(

1702 re.sub(r"^$(in the sense [^)]*)$\s+", "", x) for x in alts

1703 )

1704 # Check for parenthesized alternatives, e.g. ripromettersi/Italian

1705 if all( 1705 ↛ 1716line 1705 didn't jump to line 1716 because the condition on line 1705 was never true

1706 re.match(r"\w+( \w+)* $\w+( \w+)*(, \w+( \w+)*)*$$", alt)

1707 # word word* $word word*(, word word*)*$

1708 and all(

1709 distw([re.sub(r" \(.*", "", alt)], x) < 0.5

1710 # Levenshtein distance

1711 for x in re.sub(r".*$(.*)$", r"\1", alt).split(", ")

1712 )

1713 # Extract from parentheses for testin

1714 for alt in alts

1715 ):

1716 new_alts = []

1717 for alt in alts:

1718 # Replace parentheses before splitting

1719 alt = alt.replace(" (", ", ")

1720 alt = alt.replace(")", "")

1721 for new_alt in alt.split(", "):

1722 new_alts.append(new_alt)

1723 alts = new_alts

1724 return col, alts, split_extra_tags

1725

1726 def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:

1727 # Handle the special case where romanization is given under

1728 # normal form, e.g. in Russian. There can be multiple

1729 # comma-separated forms in each case. We also handle the case

1730 # where instead of romanization we have IPA pronunciation

1731 # (e.g., avoir/French/verb).

1732 len2 = len(alts) // 2

1733 # Check for IPAs (forms first, IPAs under)

1734 # base, base, IPA, IPA

1735 if (

1736 len(alts) % 2 == 0 # Divisibly by two

1737 and all(

1738 re.match(r"^\s*/.*/\s*$", x) # Inside slashes = IPA

1739 for x in alts[len2:]

1740 )

1741 ): # In the second half of alts

1742 nalts = list(

1743 (alts[i], "", alts[i + len2])

1744 # List of tuples: (base, "", ipa)

1745 for i in range(len2)

1746 )

1747 # base, base, base, IPA

1748 elif (

1749 len(alts) > 2

1750 and re.match(r"^\s*/.*/\s*$", alts[-1])

1751 and all(not x.startswith("/") for x in alts[:-1])

1752 ):

1753 # Only if the last alt is IPA

1754 nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))

1755 # base, IPA, IPA, IPA

1756 elif (

1757 len(alts) > 2

1758 and not alts[0].startswith("/")

1759 and all(

1760 re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))

1761 )

1762 ):

1763 # First is base and the rest is IPA alternatives

1764 nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

1765

1766 # Check for romanizations, forms first, romanizations under

1767 elif (

1768 len(alts) % 2 == 0

1769 and not any("(" in x for x in alts)

1770 and all(

1771 classify_desc(

1772 re.sub(

1773 r"\^.*$",

1774 "",

1775 # Remove ends of strings starting from ^.

1776 # Supescripts have been already removed

1777 # from the string, while ^xyz needs to be

1778 # removed separately, though it's usually

1779 # something with a single letter?

1780 "".join(xx for xx in x if not is_superscript(xx)),

1781 )

1782 )

1783 == "other"

1784 for x in alts[:len2]

1785 )

1786 and all(

1787 classify_desc(

1788 re.sub(

1789 r"\^.*$",

1790 "",

1791 "".join(xx for xx in x if not is_superscript(xx)),

1792 )

1793 )

1794 in ("romanization", "english")

1795 for x in alts[len2:]

1796 )

1797 ):

1798 nalts = list((alts[i], alts[i + len2], "") for i in range(len2))

1799 # Check for romanizations, forms and romanizations alternating

1800 elif (

1801 len(alts) % 2 == 0

1802 and not any("(" in x for x in alts)

1803 and all(

1804 classify_desc(

1805 re.sub(

1806 r"\^.*$",

1807 "",

1808 "".join(xx for xx in alts[i] if not is_superscript(xx)),

1809 )

1810 )

1811 == "other"

1812 for i in range(0, len(alts), 2)

1813 )

1814 and all(

1815 classify_desc(

1816 re.sub(

1817 r"\^.*$",

1818 "",

1819 "".join(xx for xx in alts[i] if not is_superscript(xx)),

1820 )

1821 )

1822 in ("romanization", "english")

1823 for i in range(1, len(alts), 2)

1824 )

1825 ):

1826 # odds

1827 nalts = list(

1828 (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)

1829 )

1830 # evens

1831 else:

1832 new_alts = []

1833 for alt in alts:

1834 lst = [""]

1835 idx = 0

1836 for m in re.finditer(

1837 r"(^|\w|\*)$(\w+" r"(/\w+)*)$",

1838 # start OR letter OR asterisk (word/word*)

1839 # \\___________group 1_______/ \ \_g3_///

1840 # \ \__gr. 2_//

1841 # \_____________group 0________________/

1842 alt,

1843 ):

1844 v = m.group(2) # (word/word/word...)

1845 if (

1846 classify_desc(v) == "tags" # Tags inside parens

1847 or m.group(0) == alt

1848 ): # All in parens

1849 continue

1850 new_lst = []

1851 for x in lst:

1852 x += alt[idx : m.start()] + m.group(1)

1853 # alt until letter or asterisk

1854 idx = m.end()

1855 vparts = v.split("/")

1856 # group(2) = ["word", "wörd"...]

1857 if len(vparts) == 1:

1858 new_lst.append(x)

1859 new_lst.append(x + v)

1860 # "kind(er)" -> ["kind", "kinder"]

1861 else:

1862 for vv in vparts:

1863 new_lst.append(x + vv)

1864 # "lampai(tten/den)" ->

1865 # ["lampaitten", "lampaiden"]

1866 lst = new_lst

1867 for x in lst:

1868 new_alts.append(x + alt[idx:])

1869 # add the end of alt

1870 nalts = list((x, "", "") for x in new_alts)

1871 # [form, no romz, no ipa]

1872 return nalts

1873

1874 def find_semantic_parens(form: str) -> tuple[str, list[str]]:

1875 # "Some languages" (=Greek) use brackets to mark things that

1876 # require tags, like (informality), [rarity] and {archaicity}.

1877 extra_tags = []

1878 if re.match(r"$[^][(){}]*$$", form):

1879 if get_lang_conf(lang, "parentheses_for_informal"):

1880 form = form[1:-1]

1881 extra_tags.append("informal")

1882 else:

1883 form = form[1:-1]

1884 elif re.match(r"\{\[[^][(){}]*\]\}$", form):

1885 if get_lang_conf( 1885 ↛ 1892line 1885 didn't jump to line 1892 because the condition on line 1885 was always true

1886 lang, "square_brackets_for_rare"

1887 ) and get_lang_conf(lang, "curly_brackets_for_archaic"):

1888 # είμαι/Greek/Verb

1889 form = form[2:-2]

1890 extra_tags.extend(["rare", "archaic"])

1891 else:

1892 form = form[2:-2]

1893 elif re.match(r"\{[^][(){}]*\}$", form):

1894 if get_lang_conf(lang, "curly_brackets_for_archaic"): 1894 ↛ 1899line 1894 didn't jump to line 1899 because the condition on line 1894 was always true

1895 # είμαι/Greek/Verb

1896 form = form[1:-1]

1897 extra_tags.extend(["archaic"])

1898 else:

1899 form = form[1:-1]

1900 elif re.match(r"\[[^][(){}]*\]$", form):

1901 if get_lang_conf(lang, "square_brackets_for_rare"): 1901 ↛ 1906line 1901 didn't jump to line 1906 because the condition on line 1901 was always true

1902 # είμαι/Greek/Verb

1903 form = form[1:-1]

1904 extra_tags.append("rare")

1905 else:

1906 form = form[1:-1]

1907 return form, extra_tags

1908

1909 def handle_parens(

1910 form: str, roman: str, clitic: str, extra_tags: list[str]

1911 ) -> tuple[str, str, str]:

1912 if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):

1913 # is there a clitic starting with apostrophe?

1914 clitic = paren

1915 # assume the whole paren is a clitic

1916 # then remove paren from form

1917 form = (form[: m.start()] + subst + form[m.end() :]).strip()

1918 elif classify_desc(paren) == "tags":

1919 tagsets1, topics1 = decode_tags(paren)

1920 if not topics1: 1920 ↛ 1941line 1920 didn't jump to line 1941 because the condition on line 1920 was always true

1921 for ts in tagsets1:

1922 ts = tuple(x for x in ts if " " not in x)

1923 # There are some generated tags containing

1924 # spaces; do not let them through here.

1925 extra_tags.extend(ts)

1926 form = (form[: m.start()] + subst + form[m.end() :]).strip()

1927 # brackets contain romanization

1928 elif ( 1928 ↛ 1937line 1928 didn't jump to line 1937 because the condition on line 1928 was never true

1929 m.start() > 0

1930 and not roman

1931 and classify_desc(form[: m.start()]) == "other"

1932 and

1933 # "other" ~ text

1934 classify_desc(paren) in ("romanization", "english")

1935 and not re.search(r"^with |-form$", paren)

1936 ):

1937 roman = paren

1938 form = (form[: m.start()] + subst + form[m.end() :]).strip()

1939 elif re.search(r"^with |-form", paren): 1939 ↛ 1940line 1939 didn't jump to line 1940 because the condition on line 1939 was never true

1940 form = (form[: m.start()] + subst + form[m.end() :]).strip()

1941 return form, roman, clitic

1942

1943 def merge_row_and_column_tags(form, some_has_covered_text):

1944 # Merge column tags and row tags. We give preference

1945 # to moods etc coming from rowtags (cf. austteigen/German/Verb

1946 # imperative forms).

1947

1948 # In certain cases, what a tag means depends on whether

1949 # it is a row or column header. Depending on the language,

1950 # we replace certain tags with others if they're in

1951 # a column or row

1952

1953 ret = []

1954 # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")

1955 # ctagreplacs = get_lang_conf(lang, "coltag_replacements")

1956 for rt in sorted(rowtags):

1957 if "dummy-use-as-coltags" in rt: 1957 ↛ 1958line 1957 didn't jump to line 1958 because the condition on line 1957 was never true

1958 continue

1959 # if lang was in rowtag_replacements)

1960 # if not rtagreplacs == None:

1961 # rt = replace_directional_tags(rt, rtagreplacs)

1962 for ct in sorted(coltags):

1963 if "dummy-use-as-rowtags" in ct: 1963 ↛ 1964line 1963 didn't jump to line 1964 because the condition on line 1963 was never true

1964 continue

1965 # if lang was in coltag_replacements

1966 # if not ctagreplacs == None:

1967 # ct = replace_directional_tags(ct,

1968 # ctagreplacs)

1969 tags = set(global_tags)

1970 tags.update(extra_tags)

1971 tags.update(rt)

1972 tags.update(refs_tags)

1973 tags.update(tablecontext.section_header)

1974 # Merge tags from column. For certain kinds of tags,

1975 # those coming from row take precedence.

1976 old_tags = set(tags)

1977 for t in ct:

1978 c = valid_tags[t]

1979 if c in ("mood", "case", "number") and any(

1980 valid_tags[tt] == c for tt in old_tags

1981 ):

1982 continue

1983 tags.add(t)

1984

1985 # Extract language-specific tags from the

1986 # form. This may also adjust the form.

1987 form, lang_tags = lang_specific_tags(lang, pos, form)

1988 tags.update(lang_tags)

1989

1990 # For non-finite verb forms, see if they have

1991 # a gender/class suffix

1992 if pos == "verb" and any(

1993 valid_tags[t] == "non-finite" for t in tags

1994 ):

1995 form, tt = parse_head_final_tags(wxr, lang, form)

1996 tags.update(tt)

1997

1998 # Remove "personal" tag if have nth person; these

1999 # come up with e.g. reconhecer/Portuguese/Verb. But

2000 # not if we also have "pronoun"

2001 if (

2002 "personal" in tags

2003 and "pronoun" not in tags

2004 and any(

2005 x in tags

2006 for x in [

2007 "first-person",

2008 "second-person",

2009 "third-person",

2010 ]

2011 )

2012 ):

2013 tags.remove("personal")

2014

2015 # If we have impersonal, remove person and number.

2016 # This happens with e.g. viajar/Portuguese/Verb

2017 if "impersonal" in tags:

2018 tags = tags - set(

2019 [

2020 "first-person",

2021 "second-person",

2022 "third-person",

2023 "singular",

2024 "plural",

2025 ]

2026 )

2027

2028 # Remove unnecessary "positive" tag from verb forms

2029 if pos == "verb" and "positive" in tags:

2030 if "negative" in tags: 2030 ↛ 2031line 2030 didn't jump to line 2031 because the condition on line 2030 was never true

2031 tags.remove("negative")

2032 tags.remove("positive")

2033

2034 # Many Russian (and other Slavic) inflection tables

2035 # have animate/inanimate distinction that generates

2036 # separate entries for neuter/feminine, but the

2037 # distinction only applies to masculine. Remove them

2038 # form neuter/feminine and eliminate duplicates.

2039 if get_lang_conf(lang, "masc_only_animate"):

2040 for t1 in ("animate", "inanimate"):

2041 for t2 in ("neuter", "feminine"):

2042 if (

2043 t1 in tags

2044 and t2 in tags

2045 and "masculine" not in tags

2046 and "plural" not in tags

2047 ):

2048 tags.remove(t1)

2049

2050 # German adjective tables contain "(keiner)" etc

2051 # for mixed declension plural. When the adjective

2052 # disappears and it becomes just one word, remove

2053 # the "includes-article" tag. e.g. eiskalt/German

2054 if "includes-article" in tags and " " not in form:

2055 tags.remove("includes-article")

2056

2057 # Handle ignored forms. We mark that the form was

2058 # provided. This is important information; some words

2059 # just do not have a certain form. However, there also

2060 # many cases where no word in a language has a

2061 # particular form. Post-processing could detect and

2062 # remove such cases.

2063 if form in IGNORED_COLVALUES:

2064 # if cell text seems to be ignorable

2065 if "dummy-ignore-skipped" in tags:

2066 continue

2067 if (

2068 col_idx not in has_covering_hdr

2069 and some_has_covered_text

2070 ):

2071 continue

2072 # don't ignore this cell if there's been a header

2073 # above it

2074 form = "-"

2075 elif col_idx in has_covering_hdr:

2076 some_has_covered_text = True

2077

2078 # Handle ambiguous object concord. If a header

2079 # gives the "dummy-object-concord"-tag to a word,

2080 # replace person, number and gender tags with

2081 # their "object-" counterparts so that the verb

2082 # agrees with the object instead.

2083 # Use only when the verb has ONLY object agreement!

2084 # a پخول/Pashto

2085 if "dummy-object-concord" in tags: 2085 ↛ 2086line 2085 didn't jump to line 2086 because the condition on line 2085 was never true

2086 for subtag, objtag in object_concord_replacements.items():

2087 if subtag in tags:

2088 tags.remove(subtag)

2089 tags.add(objtag)

2090

2091 # Remove the dummy mood tag that we sometimes

2092 # use to block adding other mood and related

2093 # tags

2094 tags = tags - set(

2095 [

2096 "dummy-mood",

2097 "dummy-tense",

2098 "dummy-ignore-skipped",

2099 "dummy-object-concord",

2100 "dummy-reset-headers",

2101 "dummy-use-as-coltags",

2102 "dummy-use-as-rowtags",

2103 "dummy-store-hdrspan",

2104 "dummy-load-stored-hdrspans",

2105 "dummy-reset-stored-hdrspans",

2106 "dummy-section-header",

2107 ]

2108 )

2109

2110 # Perform language-specific tag replacements according

2111 # to rules in a table.

2112 lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")

2113 if lang_tag_mappings is not None: 2113 ↛ 2114line 2113 didn't jump to line 2114 because the condition on line 2113 was never true

2114 for pre, post in lang_tag_mappings.items():

2115 if all(t in tags for t in pre):

2116 tags = (tags - set(pre)) | set(post)

2117

2118 # Warn if there are entries with empty tags

2119 if not tags: 2119 ↛ 2120line 2119 didn't jump to line 2120 because the condition on line 2119 was never true

2120 wxr.wtp.debug(

2121 "inflection table: empty tags for {}".format(form),

2122 sortid="inflection/1826",

2123 )

2124

2125 # Warn if form looks like IPA

2126 ########## XXX ########

2127 # Because IPA is its own unicode block, we could also

2128 # technically do a Unicode name check to see if a string

2129 # contains IPA. Not all valid IPA characters are in the

2130 # IPA extension block, so you can technically have false

2131 # negatives if it's something like /toki/, but it

2132 # shouldn't give false positives.

2133 # Alternatively, you could make a list of IPA-admissible

2134 # characters and reject non-IPA stuff with that.

2135 if re.match(r"\s*/.*/\s*$", form): 2135 ↛ 2136line 2135 didn't jump to line 2136 because the condition on line 2135 was never true

2136 wxr.wtp.debug(

2137 "inflection table form looks like IPA: "

2138 "form={} tags={}".format(form, tags),

2139 sortid="inflection/1840",

2140 )

2141

2142 # Note that this checks `form`, not `in tags`

2143 if form == "dummy-ignored-text-cell": 2143 ↛ 2144line 2143 didn't jump to line 2144 because the condition on line 2143 was never true

2144 continue

2145

2146 if "dummy-remove-this-cell" in tags: 2146 ↛ 2147line 2146 didn't jump to line 2147 because the condition on line 2146 was never true

2147 continue

2148

2149 # Add the form

2150 tags = list(sorted(tags))

2151 dt = {"form": form, "tags": tags, "source": source}

2152 if roman:

2153 dt["roman"] = roman

2154 if ipa:

2155 dt["ipa"] = ipa

2156 ret.append(dt)

2157 # If we got separate clitic form, add it

2158 if clitic:

2159 dt = {

2160 "form": clitic,

2161 "tags": tags + ["clitic"],

2162 "source": source,

2163 }

2164 ret.append(dt)

2165 return ret, form, some_has_covered_text

2166

2167 # First extract definitions from cells

2168 # See defs_ht for footnote defs stuff

2169 for row in rows:

2170 for cell in row:

2171 text, refs, defs, hdr_tags = extract_cell_content(

2172 lang, word, cell.text

2173 )

2174 # refs, defs = footnote stuff, defs -> (ref, def)

2175 add_defs(defs)

2176 # Extract definitions from text after table

2177 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after)

2178 add_defs(defs)

2179

2180 # Then extract the actual forms

2181 ret = []

2182 hdrspans = []

2183 first_col_has_text = False

2184 rownum = 0

2185 title = None

2186 global_tags = []

2187 table_tags = []

2188 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits")

2189 form_replacements = get_lang_conf(lang, "form_replacements")

2190 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells")

2191 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups")

2192

2193 for title in titles:

2194 more_global_tags, more_table_tags, extra_forms = parse_title(

2195 title, source

2196 )

2197 global_tags.extend(more_global_tags)

2198 table_tags.extend(more_table_tags)

2199 ret.extend(extra_forms)

2200 cell_rowcnt = collections.defaultdict(int)

2201 seen_cells = set()

2202 has_covering_hdr = set()

2203 some_has_covered_text = False

2204 for row in rows:

2205 # print("ROW:", row)

2206 # print("====")

2207 # print(f"Start of PREVIOUS row hdrspans:"

2208 # f"{tuple(sp.tagsets for sp in hdrspans)}")

2209 # print(f"Start of row txt: {tuple(t.text for t in row)}")

2210 if not row: 2210 ↛ 2211line 2210 didn't jump to line 2211 because the condition on line 2210 was never true

2211 continue # Skip empty rows

2212 all_headers = all(x.is_title or not x.text.strip() for x in row)

2213 text = row[0].text

2214 if (

2215 row[0].is_title

2216 and text

2217 and not is_superscript(text[0])

2218 and text not in infl_map # zealous inflation map?

2219 and (

2220 re.match(r"Inflection ", text)

2221 or re.sub(

2222 r"\s+",

2223 " ", # flatten whitespace

2224 re.sub(

2225 r"\s*$[^)]*$",

2226 "",

2227 # Remove whitespace+parens

2228 text,

2229 ),

2230 ).strip()

2231 not in infl_map

2232 )

2233 and not re.match(infl_start_re, text)

2234 and all(

2235 x.is_title == row[0].is_title and x.text == text

2236 # all InflCells in `row` have the same is_title and text

2237 for x in row

2238 )

2239 ):

2240 if text and title is None:

2241 # Only if there were no titles previously make the first

2242 # text that is found the title

2243 title = text

2244 if re.match(r"(Note:|Notes:)", title): 2244 ↛ 2245line 2244 didn't jump to line 2245 because the condition on line 2244 was never true

2245 continue # not a title

2246 more_global_tags, more_table_tags, extra_forms = parse_title(

2247 title, source

2248 )

2249 global_tags.extend(more_global_tags)

2250 table_tags.extend(more_table_tags)

2251 ret.extend(extra_forms)

2252 continue # Skip title rows without incrementing i

2253 if "dummy-skip-this" in global_tags: 2253 ↛ 2254line 2253 didn't jump to line 2254 because the condition on line 2253 was never true

2254 return []

2255 rowtags = [()]

2256 # have_hdr = False

2257 # have_hdr never used?

2258 have_text = False

2259 samecell_cnt = 0

2260 col0_hdrspan = None # col0 or later header (despite its name)

2261 col0_followed_by_nonempty = False

2262 row_empty = True

2263 for col_idx, cell in enumerate(row):

2264 colspan = cell.colspan # >= 1

2265 rowspan = cell.rowspan # >= 1

2266 previously_seen = id(cell) in seen_cells

2267 # checks to see if this cell was in the previous ROW

2268 seen_cells.add(id(cell))

2269 if samecell_cnt == 0:

2270 # First column of a (possible multi-column) cell

2271 samecell_cnt = colspan - 1

2272 else:

2273 assert samecell_cnt > 0

2274 samecell_cnt -= 1

2275 continue

2276

2277 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0

2278 # never used?

2279

2280 # defaultdict(int) around line 1900

2281 cell_rowcnt[id(cell)] += 1

2282 # => how many cols this spans

2283 col = cell.text

2284 if not col:

2285 continue

2286 row_empty = False

2287 is_title = cell.is_title

2288

2289 # If the cell has a target, i.e., text after colon, interpret

2290 # it as simply specifying a value for that value and ignore

2291 # it otherwise.

2292 if cell.target:

2293 text, refs, defs, hdr_tags = extract_cell_content(

2294 lang, word, col

2295 )

2296 if not text: 2296 ↛ 2297line 2296 didn't jump to line 2297 because the condition on line 2296 was never true

2297 continue

2298 refs_tags = set()

2299 for ref in refs: # gets tags from footnotes 2299 ↛ 2300line 2299 didn't jump to line 2300 because the loop on line 2299 never started

2300 if ref in def_ht:

2301 refs_tags.update(def_ht[ref])

2302 rowtags = expand_header(

2303 wxr,

2304 tablecontext,

2305 word,

2306 lang,

2307 pos,

2308 text,

2309 [],

2310 silent=True,

2311 depth=depth,

2312 )

2313 rowtags = list(

2314 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags)

2315 )

2316 is_title = False

2317 col = cell.target

2318

2319 # print(rownum, col_idx, col)

2320 # print(f"is_title: {is_title}")

2321 if is_title:

2322 # It is a header cell

2323 text, refs, defs, hdr_tags = extract_cell_content(

2324 lang, word, col

2325 )

2326 if not text:

2327 continue

2328 # Extract tags from referenced footnotes

2329 refs_tags = set()

2330 for ref in refs:

2331 if ref in def_ht:

2332 refs_tags.update(def_ht[ref])

2333

2334 # Expand header to tags

2335 v = expand_header(

2336 wxr,

2337 tablecontext,

2338 word,

2339 lang,

2340 pos,

2341 text,

2342 [],

2343 silent=True,

2344 depth=depth,

2345 )

2346 # print("EXPANDED {!r} to {}".format(text, v))

2347

2348 if col_idx == 0:

2349 # first_col_has_text is used for a test to ignore

2350 # upper-left cells that are just text without

2351 # header info

2352 first_col_has_text = True

2353 # Check if the header expands to reset hdrspans

2354 if any("dummy-reset-headers" in tt for tt in v):

2355 new_hdrspans = []

2356 for hdrspan in hdrspans:

2357 # if there are HdrSpan objects (abstract headers with

2358 # row- and column-spans) that are to the left or at the

2359 # same row or below, KEEP those; things above and to

2360 # the right of the hdrspan with dummy-reset-headers

2361 # are discarded. Tags from the header together with

2362 # dummy-reset-headers are kept as normal.

2363 if (

2364 hdrspan.start + hdrspan.colspan < col_idx

2365 or hdrspan.rownum > rownum - cell.rowspan

2366 ):

2367 new_hdrspans.append(hdrspan)

2368 hdrspans = new_hdrspans

2369

2370 for tt in v:

2371 if "dummy-section-header" in tt: 2371 ↛ 2372line 2371 didn't jump to line 2372 because the condition on line 2371 was never true

2372 tablecontext.section_header = tt

2373 break

2374 if "dummy-reset-section-header" in tt: 2374 ↛ 2375line 2374 didn't jump to line 2375 because the condition on line 2374 was never true

2375 tablecontext.section_header = []

2376 # Text between headers on a row causes earlier headers to

2377 # be reset

2378 if have_text:

2379 # print(" HAVE_TEXT BEFORE HDR:", col)

2380 # Reset rowtags if new title column after previous

2381 # text cells

2382 # +-----+-----+-----+-----+

2384 # +-----+-----+-----+-----+

2385 # ^reset rowtags=>

2386 # XXX beware of header "—": "" - must not clear on that if

2387 # it expands to no tags

2388 rowtags = [()]

2389 # have_hdr = True

2390 # have_hdr never used?

2391 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags))

2392 # Update rowtags and coltags

2393 has_covering_hdr.add(col_idx) # col_idx == current column

2394 # has_covering_hdr is a set that has the col_idx-ids of columns

2395 # that have previously had some kind of header. It is never

2396 # resetted inside the col_idx-loops OR the bigger rows-loop, so

2397 # applies to the whole table.

2398

2399 rowtags, new_coltags, all_hdr_tags = generate_tags(

2400 rowtags, table_tags

2401 )

2402

2403 if any("dummy-skip-this" in ts for ts in rowtags):

2404 continue # Skip this cell

2405

2406 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2406 ↛ 2407line 2406 didn't jump to line 2407 because the condition on line 2406 was never true

2407 hdrspans.extend(tablecontext.stored_hdrspans)

2408

2409 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2409 ↛ 2410line 2409 didn't jump to line 2410 because the condition on line 2409 was never true

2410 tablecontext.stored_hdrspans = []

2411

2412 if any("dummy-store-hdrspan" in ts for ts in v): 2412 ↛ 2414line 2412 didn't jump to line 2414 because the condition on line 2412 was never true

2413 # print(f"STORED: {col}")

2414 store_new_hdrspan = True

2415 else:

2416 store_new_hdrspan = False

2417

2418 new_coltags = list(

2419 x

2420 for x in new_coltags

2421 if not any(t in noinherit_tags for t in x)

2422 )

2423 # print("new_coltags={} previously_seen={} all_hdr_tags={}"

2424 # .format(new_coltags, previously_seen, all_hdr_tags))

2425 if any(new_coltags):

2426 (

2427 col,

2428 col0_followed_by_nonempty,

2429 col0_hdrspan,

2430 ) = add_new_hdrspan(

2431 col,

2432 hdrspans,

2433 store_new_hdrspan,

2434 col0_followed_by_nonempty,

2435 col0_hdrspan,

2436 )

2437

2438 continue

2439

2440 # These values are ignored, at least for now

2441 if re.match(r"^(# |\(see )", col): 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true

2442 continue

2443

2444 if any("dummy-skip-this" in ts for ts in rowtags):

2445 continue # Skip this cell

2446

2447 # If the word has no rowtags and is a multi-row cell, then

2448 # ignore this. This happens with empty separator rows

2449 # within a rowspan>1 cell. cf. wander/English/Conjugation.

2450 if rowtags == [()] and rowspan > 1:

2451 continue

2452

2453 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle.

2454 if cleanup_rules:

2455 for regx, substitution in cleanup_rules.items():

2456 col = re.sub(regx, substitution, col)

2457

2458 if ( 2458 ↛ 2463line 2458 didn't jump to line 2463 because the condition on line 2458 was never true

2459 col_idx == 0

2460 and not first_col_has_text

2461 and get_lang_conf(lang, "ignore_top_left_text_cell") is True

2462 ):

2463 continue # Skip text at top left, as in Icelandic, Faroese

2464

2465 # if col0_hdrspan is not None:

2466 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}"

2467 # .format(col0_hdrspan.text, col))

2468 col0_followed_by_nonempty = True

2469 have_text = True

2470

2471 # Determine column tags for the multi-column cell

2472 combined_coltags = compute_coltags(

2473 lang, pos, hdrspans, col_idx, colspan, col

2474 )

2475 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2475 ↛ 2476line 2475 didn't jump to line 2476 because the condition on line 2475 was never true

2476 continue

2477

2478 # print("HAVE_TEXT:", repr(col))

2479 # Split the text into separate forms. First simplify spaces except

2480 # newline.

2481 col = re.sub(r"[ \t\r]+", " ", col)

2482 # Split the cell text into alternatives

2483

2484 col, alts, split_extra_tags = split_text_into_alts(col)

2485

2486 # Some cells have mixed form content, like text and romanization,

2487 # or text and IPA. Handle these.

2488 alts = handle_mixed_lines(alts)

2489

2490 alts = list((x, combined_coltags) for x in alts)

2491

2492 # Generate forms from the alternatives

2493 # alts is a list of (tuple of forms, tuple of tags)

2494 for (form, base_roman, ipa), coltags in alts:

2495 form = form.strip()

2496 extra_tags = []

2497 extra_tags.extend(split_extra_tags)

2498 # Handle special splits again here, so that we can have custom

2499 # mappings from form to form and tags.

2500 if form in form_replacements:

2501 replacement, tags = form_replacements[form]

2502 for x in tags.split():

2503 assert x in valid_tags

2504 assert isinstance(replacement, str)

2505 assert isinstance(tags, str)

2506 form = replacement

2507 extra_tags.extend(tags.split())

2508 # Clean the value, extracting reference symbols

2509 form, refs, defs, hdr_tags = extract_cell_content(

2510 lang, word, form

2511 )

2512 # if refs:

2513 # print("REFS:", refs)

2514 extra_tags.extend(hdr_tags)

2515 # Extract tags from referenced footnotes

2516 # Extract tags from referenced footnotes

2517 refs_tags = set()

2518 for ref in refs:

2519 if ref in def_ht:

2520 refs_tags.update(def_ht[ref])

2521

2522 if base_roman:

2523 base_roman, _, _, hdr_tags = extract_cell_content(

2524 lang, word, base_roman

2525 )

2526 extra_tags.extend(hdr_tags)

2527

2528 # Do some additional cleanup on the cell.

2529 form = re.sub(r"^\s*,\s*", "", form)

2530 form = re.sub(r"\s*,\s*$", "", form)

2531 form = re.sub(r"\s*(,\s*)+", ", ", form)

2532 form = re.sub(r"(?i)^Main:", "", form)

2533 form = re.sub(r"\s+", " ", form)

2534 form = form.strip()

2535

2536 # Look for parentheses that have semantic meaning

2537 form, et = find_semantic_parens(form)

2538 extra_tags.extend(et)

2539

2540 # Handle parentheses in the table element. We parse

2541 # tags anywhere and romanizations anywhere but beginning.

2542 roman = base_roman

2543 paren = None

2544 clitic = None

2545 m = re.search(r"(\s+|^)$([^)]*)$", form)

2546 # start|spaces + (anything)

2547 if m is not None:

2548 subst = m.group(1)

2549 paren = m.group(2)

2550 else:

2551 m = re.search(r"$([^)]*)$(\s+|$)", form)

2552 # (anything) + spaces|end

2553 if m is not None: 2553 ↛ 2554line 2553 didn't jump to line 2554 because the condition on line 2553 was never true

2554 paren = m.group(1)

2555 subst = m.group(2)

2556 if paren is not None:

2557 form, roman, clitic = handle_parens(

2558 form, roman, clitic, extra_tags

2559 )

2560

2561 # Ignore certain forms that are not really forms,

2562 # unless they're really, really close to the article title

2563 if form in ( 2563 ↛ 2568line 2563 didn't jump to line 2568 because the condition on line 2563 was never true

2564 "",

2565 "unchanged",

2566 "after an", # in sona/Irish/Adj/Mutation

2567 ):

2568 Lev = distw([form], word)

2569 if form and Lev < 0.1:

2570 wxr.wtp.debug(

2571 "accepted possible false positive '{}' with"

2572 "> 0.1 Levenshtein distance in {}/{}".format(

2573 form, word, lang

2574 ),

2575 sortid="inflection/2213",

2576 )

2577 elif form and Lev < 0.3:

2578 wxr.wtp.debug(

2579 "skipped possible match '{}' with > 0.3"

2580 "Levenshtein distance in {}/{}".format(

2581 form, word, lang

2582 ),

2583 sortid="inflection/2218",

2584 )

2585 continue

2586 else:

2587 continue

2588 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} "

2589 # "FORM={!r} ROMAN={!r}"

2590 # .format(rowtags, coltags, refs_tags,

2591 # form, roman))

2592

2593 # Merge tags from row and column and do miscellaneous

2594 # tag-related handling.

2595 (

2596 merge_ret,

2597 form,

2598 some_has_covered_text,

2599 ) = merge_row_and_column_tags(form, some_has_covered_text)

2600 ret.extend(merge_ret)

2601

2602 # End of row.

2603 rownum += 1

2604 # For certain languages, if the row was empty, reset

2605 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb).

2606 if row_empty and get_lang_conf(lang, "empty_row_resets"):

2607 hdrspans = []

2608 # Check if we should expand col0_hdrspan.

2609 if col0_hdrspan is not None:

2610 col0_allowed = get_lang_conf(lang, "hdr_expand_first")

2611 col0_cats = tagset_cats(col0_hdrspan.tagsets)

2612 # Only expand if col0_cats and later_cats are allowed

2613 # and don't overlap and col0 has tags, and there have

2614 # been no disallowed cells in between.

2615 if (

2616 not col0_followed_by_nonempty

2617 and not (col0_cats - col0_allowed)

2618 and

2619 # len(col0_cats) == 1 and

2620 col_idx > col0_hdrspan.start + col0_hdrspan.colspan

2621 ):

2622 # If an earlier header is only followed by headers that yield

2623 # no tags, expand it to entire row

2624 # print("EXPANDING COL0: {} from {} to {} cols {}"

2625 # .format(col0_hdrspan.text, col0_hdrspan.colspan,

2626 # len(row) - col0_hdrspan.start,

2627 # col0_hdrspan.tagsets))

2628 col0_hdrspan.colspan = len(row) - col0_hdrspan.start

2629 col0_hdrspan.expanded = True

2630 # XXX handle refs and defs

2631 # for x in hdrspans:

2632 # print(" HDRSPAN {} {} {} {!r}"

2633 # .format(x.start, x.colspan, x.tagsets, x.text))

2634

2635 # Post-process German nouns with articles in separate columns. We move the

2636 # definite/indefinite/usually-without-article markers into the noun and

2637 # remove the article entries.

2638 if get_lang_conf(lang, "articles_in_separate_columns") and any(

2639 "noun" in x["tags"] for x in ret

2640 ):

2641 new_ret = []

2642 saved_tags = set()

2643 had_noun = False

2644 for dt in ret:

2645 tags = dt["tags"]

2646 # print(tags)

2647 if "noun" in tags:

2648 tags = list(

2649 sorted(set(t for t in tags if t != "noun") | saved_tags)

2650 )

2651 had_noun = True

2652 elif ( 2652 ↛ 2679line 2652 didn't jump to line 2679 because the condition on line 2652 was always true

2653 "indefinite" in tags

2654 or "definite" in tags

2655 or "usually-without-article" in tags

2656 or "without-article" in tags

2657 ):

2658 if had_noun:

2659 saved_tags = set(tags)

2660 else:

2661 saved_tags = saved_tags | set(tags) # E.g. Haus/German

2662 remove_useless_tags(lang, pos, saved_tags)

2663 saved_tags = saved_tags & set(

2664 [

2665 "masculine",

2666 "feminine",

2667 "neuter",

2668 "singular",

2669 "plural",

2670 "indefinite",

2671 "definite",

2672 "usually-without-article",

2673 "without-article",

2674 ]

2675 )

2676 had_noun = False

2677 continue # Skip the articles

2678

2679 dt = dt.copy()

2680 dt["tags"] = tags

2681 new_ret.append(dt)

2682 ret = new_ret

2683

2684 elif possibly_ignored_forms:

2685 # Some languages have tables with cells that are kind of separated

2686 # and difficult to handle, like eulersche Formel/German where

2687 # the definite and indefinite articles are just floating.

2688 # If a language has a dict of conditionally_ignored_cells,

2689 # and if the contents of a cell is found in one of the rules

2690 # there, ignore that cell if it

2691 # 1. Does not have the appropriate tag (like "definite" for "die")

2692 # and

2693 # 2. The title of the article is not one of the other co-words

2694 # (ie. it's an article for the definite articles in german etc.)

2695 # pass

2696 new_ret = []

2697 for cell_data in ret:

2698 tags = cell_data["tags"]

2699 text = cell_data["form"]

2700 skip_this = False

2701 for key_tag, ignored_forms in possibly_ignored_forms.items():

2702 if text not in ignored_forms: 2702 ↛ 2704line 2702 didn't jump to line 2704 because the condition on line 2702 was always true

2703 continue

2704 if word in ignored_forms:

2705 continue

2706 if key_tag not in tags:

2707 skip_this = True

2708

2709 if skip_this: 2709 ↛ 2710line 2709 didn't jump to line 2710 because the condition on line 2709 was never true

2710 continue

2711 new_ret.append(cell_data)

2712

2713 ret = new_ret

2714

2715 # Post-process English inflection tables, addding "multiword-construction"

2716 # when the number of words has increased.

2717 if lang == "English" and pos == "verb":

2718 word_words = len(word.split())

2719 new_ret = []

2720 for dt in ret:

2721 form = dt.get("form", "")

2722 if len(form.split()) > word_words:

2723 dt = dt.copy()

2724 dt["tags"] = list(dt.get("tags", []))

2725 # This strange copy-assigning shuffle is preventative black

2726 # magic; do not touch lest you invoke deep bugs.

2727 data_append(dt, "tags", "multiword-construction")

2728 new_ret.append(dt)

2729 ret = new_ret

2730

2731 # Always insert "table-tags" detail as the first entry in any inflection

2732 # table. This way we can reliably detect where a new table starts.

2733 # Table-tags applies until the next table-tags entry.

2734 if ret or table_tags:

2735 table_tags = list(sorted(set(table_tags)))

2736 dt = {

2737 "form": " ".join(table_tags),

2738 "source": source,

2739 "tags": ["table-tags"],

2740 }

2741 if dt["form"] == "":

2742 dt["form"] = "no-table-tags"

2743 if tablecontext.template_name:

2744 tn = {

2745 "form": tablecontext.template_name,

2746 "source": source,

2747 "tags": ["inflection-template"],

2748 }

2749 ret = [dt] + [tn] + ret

2750 else:

2751 ret = [dt] + ret

2752

2753 return ret

2754

2755

2756def handle_generic_table(

2757 wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth

2758):

2759 assert isinstance(wxr, WiktextractContext)

2760 assert isinstance(data, dict)

2761 assert isinstance(word, str)

2762 assert isinstance(lang, str)

2763 assert isinstance(pos, str)

2764 assert isinstance(rows, list)

2765 assert isinstance(source, str)

2766 assert isinstance(after, str)

2767 assert isinstance(depth, int)

2768 for row in rows:

2769 assert isinstance(row, list)

2770 for x in row:

2771 assert isinstance(x, InflCell)

2772 assert isinstance(titles, list)

2773 for x in titles:

2774 assert isinstance(x, str)

2775

2776 # Try to parse the table as a simple table

2777 ret = parse_simple_table(

2778 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth

2779 )

2780 if ret is None: 2780 ↛ 2783line 2780 didn't jump to line 2783 because the condition on line 2780 was never true

2781 # XXX handle other table formats

2782 # We were not able to handle the table

2783 wxr.wtp.debug(

2784 "unhandled inflection table format, {}/{}".format(word, lang),

2785 sortid="inflection/2370",

2786 )

2787 return

2788

2789 # Add the returned forms but eliminate duplicates.

2790 have_forms = set()

2791 for dt in ret:

2792 fdt = freeze(dt)

2793 if fdt in have_forms:

2794 continue # Don't add duplicates

2795 # Some Russian words have Declension and Pre-reform declension partially

2796 # duplicating same data. Don't add "dated" tags variant if already have

2797 # the same without "dated" from the modern declension table

2798

2799 tags = dt.get("tags", [])

2800 for dated_tag in ("dated",):

2801 if dated_tag in tags:

2802 dt2 = dt.copy()

2803 tags2 = list(x for x in tags if x != dated_tag)

2804 dt2["tags"] = tags2

2805 if tags2 and freeze(dt2) in have_forms: 2805 ↛ 2806line 2805 didn't jump to line 2806 because the condition on line 2805 was never true

2806 break # Already have without archaic

2807 else:

2808 if "table-tags" not in tags:

2809 have_forms.add(fdt)

2810 data_append(data, "forms", dt)

2811

2812

2813def determine_header(

2814 wxr,

2815 tablecontext,

2816 lang,

2817 word,

2818 pos,

2819 table_kind,

2820 kind,

2821 style,

2822 row,

2823 col,

2824 celltext,

2825 titletext,

2826 cols_headered,

2827 target,

2828 cellstyle,

2829):

2830 assert isinstance(table_kind, NodeKind)

2831 assert isinstance(kind, (NodeKind, str))

2832 assert style is None or isinstance(style, str)

2833 assert cellstyle is None or isinstance(cellstyle, str)

2834

2835 if table_kind == NodeKind.TABLE:

2836 header_kind = NodeKind.TABLE_HEADER_CELL

2837 elif table_kind == NodeKind.HTML: 2837 ↛ 2839line 2837 didn't jump to line 2839 because the condition on line 2837 was always true

2838 header_kind = "th"

2839 idx = celltext.find(": ")

2840 is_title = False

2841 # remove anything in parentheses, compress whitespace, .strip()

2842 cleaned_titletext = re.sub(

2843 r"\s+", " ", re.sub(r"\s*$[^)]*$", "", titletext)

2844 ).strip()

2845 cleaned, _, _, _ = extract_cell_content(lang, word, celltext)

2846 cleaned = re.sub(r"\s+", " ", cleaned)

2847 hdr_expansion = expand_header(

2848 wxr,

2849 tablecontext,

2850 word,

2851 lang,

2852 pos,

2853 cleaned,

2854 [],

2855 silent=True,

2856 ignore_tags=True,

2857 )

2858 candidate_hdr = not any(

2859 any(t.startswith("error-") for t in ts) for ts in hdr_expansion

2860 )

2861 # KJ candidate_hdr says that a specific cell is a candidate

2862 # for being a header because it passed through expand_header

2863 # without getting any "error-" tags; that is, the contents

2864 # is "valid" for being a header; these are the false positives

2865 # we want to catch

2866 ignored_cell = any(

2867 any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion

2868 )

2869 # ignored_cell should NOT be used to filter for headers, like

2870 # candidate_hdr is used, but only to filter for related *debug

2871 # messages*: some dummy-tags are actually half-way to headers,

2872 # like ones with "Notes", so they MUST be headers, but later

2873 # on they're ignored *as* headers so they don't need to print

2874 # out any cells-as-headers debug messages.

2875 if (

2876 candidate_hdr

2877 and kind != header_kind

2878 and cleaned != ""

2879 and cleaned != "dummy-ignored-text-cell"

2880 and cleaned not in IGNORED_COLVALUES

2881 ):

2882 # print("col: {}".format(col))

2883 if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:

2884 wxr.wtp.debug(

2885 "rejected heuristic header: "

2886 "table cell identified as header and given "

2887 "candidate status, BUT {} is not in "

2888 "LANGUAGES_WITH_CELLS_AS_HEADERS; "

2889 "cleaned text: {}".format(lang, cleaned),

2890 sortid="inflection/2447",

2891 )

2892 candidate_hdr = False

2893 elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):

2894 wxr.wtp.debug(

2895 "rejected heuristic header: "

2896 "table cell identified as header and given "

2897 "candidate status, BUT the cleaned text is "

2898 "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "

2899 "cleaned text: {}".format(lang, cleaned),

2900 sortid="inflection/2457",

2901 )

2902 candidate_hdr = False

2903 else:

2904 wxr.wtp.debug(

2905 "accepted heuristic header: "

2906 "table cell identified as header and given "

2907 "candidate status, AND the cleaned text is "

2908 "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "

2909 "cleaned text: {}".format(lang, cleaned),

2910 sortid="inflection/2466",

2911 )

2912

2913 # If the cell starts with something that could start a

2914 # definition (typically a reference symbol), make it a candidate

2915 # regardless of whether the language is listed.

2916 if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned): 2916 ↛ 2917line 2916 didn't jump to line 2917 because the condition on line 2916 was never true

2917 candidate_hdr = True

2918

2919 # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "

2920 # "lang={} pos={}"

2921 # .format(titletext, hdr_expansion, candidate_hdr,

2922 # lang, pos))

2923 if idx >= 0 and titletext[:idx] in infl_map:

2924 target = titletext[idx + 2 :].strip()

2925 celltext = celltext[:idx]

2926 is_title = True

2927 elif (

2928 kind == header_kind

2929 and " + " not in titletext # For "avoir + blah blah"?

2930 and not any(

2931 isinstance(x, WikiNode)

2932 and x.kind == NodeKind.HTML

2933 and x.sarg == "span"

2934 and x.attrs.get("lang") in ("az",)

2935 for x in col.children

2936 )

2937 ):

2938 is_title = True

2939 elif (

2940 candidate_hdr

2941 and cleaned_titletext not in IGNORED_COLVALUES

2942 and distw([cleaned_titletext], word) > 0.3

2943 and cleaned_titletext not in ("I", "es")

2944 ):

2945 is_title = True

2946 # if first column or same style as first column

2947 elif (

2948 style == cellstyle

2949 and

2950 # and title is not identical to word name

2951 titletext != word

2952 and cleaned not in IGNORED_COLVALUES

2953 and cleaned != "dummy-ignored-text-cell"

2954 and

2955 # the style composite string is not broken

2956 not style.startswith("////")

2957 and " + " not in titletext

2958 ):

2959 if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS: 2959 ↛ 2960line 2959 didn't jump to line 2960 because the condition on line 2959 was never true

2960 wxr.wtp.debug(

2961 "rejected heuristic header: "

2962 "table cell identified as header based "

2963 "on style, BUT {} is not in "

2964 "LANGUAGES_WITH_CELLS_AS_HEADERS; "

2965 "cleaned text: {}, style: {}".format(lang, cleaned, style),

2966 sortid="inflection/2512",

2967 )

2968 elif ( 2968 ↛ 2972line 2968 didn't jump to line 2972 because the condition on line 2968 was never true

2969 not ignored_cell

2970 and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")

2971 ):

2972 wxr.wtp.debug(

2973 "rejected heuristic header: "

2974 "table cell identified as header based "

2975 "on style, BUT the cleaned text is "

2976 "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "

2977 "cleaned text: {}, style: {}".format(lang, cleaned, style),

2978 sortid="inflection/2522",

2979 )

2980 else:

2981 wxr.wtp.debug(

2982 "accepted heuristic header: "

2983 "table cell identified as header based "

2984 "on style, AND the cleaned text is "

2985 "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "

2986 "cleaned text: {}, style: {}".format(lang, cleaned, style),

2987 sortid="inflection/2530",

2988 )

2989 is_title = True

2990 if ( 2990 ↛ 2997line 2990 didn't jump to line 2997 because the condition on line 2990 was never true

2991 not is_title

2992 and len(row) < len(cols_headered)

2993 and cols_headered[len(row)]

2994 ):

2995 # Whole column has title suggesting they are headers

2996 # (e.g. "Case")

2997 is_title = True

2998 if re.match(

2999 r"Conjugation of |Declension of |Inflection of |"

3000 r"Mutation of |Notes\b", # \b is word-boundary

3001 titletext,

3002 ):

3003 is_title = True

3004 return is_title, hdr_expansion, target, celltext

3005

3006

3007class TableContext:

3008 """Saved context used when parsing a table and its subtables."""

3009

3010 __slot__ = (

3011 "stored_hdrspans",

3012 "section_header",

3013 "template_name",

3014 )

3015

3016 def __init__(self, template_name=None):

3017 self.stored_hdrspans = []

3018 self.section_header = []

3019 if not template_name:

3020 self.template_name = ""

3021 else:

3022 self.template_name = template_name

3023

3024

3025def handle_wikitext_or_html_table(

3026 wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None

3027):

3028 """Parses a table from parsed Wikitext format into rows and columns of

3029 InflCell objects and then calls handle_generic_table() to parse it into

3030 forms. This adds the forms into ``data``."""

3031 assert isinstance(wxr, WiktextractContext)

3032 assert isinstance(word, str)

3033 assert isinstance(lang, str)

3034 assert isinstance(pos, str)

3035 assert isinstance(data, dict)

3036 assert isinstance(tree, WikiNode)

3037 assert tree.kind == NodeKind.TABLE or (

3038 tree.kind == NodeKind.HTML and tree.sarg == "table"

3039 )

3040 assert isinstance(titles, list)

3041 assert isinstance(source, str)

3042 for x in titles:

3043 assert isinstance(x, str)

3044 assert isinstance(after, str)

3045 assert tablecontext is None or isinstance(tablecontext, TableContext)

3046 # Imported here to avoid a circular import

3047 from wiktextract.page import clean_node, recursively_extract

3048

3049 if not tablecontext:

3050 tablecontext = TableContext()

3051

3052 def handle_table1(

3053 wxr,

3054 tablecontext,

3055 word,

3056 lang,

3057 pos,

3058 data,

3059 tree,

3060 titles,

3061 source,

3062 after,

3063 depth,

3064 ):

3065 """Helper function allowing the 'flattening' out of the table

3066 recursion: instead of handling the tables in the wrong order

3067 (recursively), this function adds to new_row that is then

3068 iterated through in the main function at the end, creating

3069 a longer table (still in pieces) in the correct order."""

3070

3071 assert isinstance(data, dict)

3072 assert isinstance(titles, list)

3073 assert isinstance(source, str)

3074 for x in titles:

3075 assert isinstance(x, str)

3076 assert isinstance(after, str)

3077 assert isinstance(depth, int)

3078 # print("HANDLE_WIKITEXT_TABLE", titles)

3079

3080 col_gap_data = [] # Filling for columns with rowspan > 1

3081 # col_gap_data contains None or InflCell

3082 vertical_still_left = [] # Number of remaining rows for which to fill

3083 # the column; vertical_still_left contains int

3084 cols_headered = [] # [F, T, F, F...]

3085 # True when the whole column contains headers, even

3086 # when the cell is not considered a header; triggered

3087 # by the "*" inflmap meta-tag.

3088 rows = []

3089

3090 sub_ret = []

3091

3092 for node in tree.children:

3093 if not isinstance(node, WikiNode):

3094 continue

3095 if node.kind == NodeKind.HTML:

3096 kind = node.sarg

3097 else:

3098 kind = node.kind

3099

3100 # print(" {}".format(node))

3101 if kind in (NodeKind.TABLE_CAPTION, "caption"):

3102 # print(" CAPTION:", node)

3103 pass

3104 elif kind in (NodeKind.TABLE_ROW, "tr"):

3105 if "vsShow" in node.attrs.get("class", "").split():

3106 # vsShow rows are those that are intially shown in tables

3107 # that have more data. The hidden data duplicates these

3108 # rows, so we skip it and just process the hidden data.

3109 continue

3110

3111 # Parse a table row.

3112 row = []

3113 style = None

3114 row_has_nonempty_cells = False

3115 # Have nonempty cell not from rowspan

3116 for col in node.children:

3117 # loop through each cell in the ROW

3118 if not isinstance(col, WikiNode):

3119 # This skip is not used for counting,

3120 # "None" is not used in

3121 # indexing or counting or looping.

3122 continue

3123 if col.kind == NodeKind.HTML:

3124 kind = col.sarg

3125 else:

3126 kind = col.kind

3127 if kind not in ( 3127 ↛ 3133line 3127 didn't jump to line 3133 because the condition on line 3127 was never true

3128 NodeKind.TABLE_HEADER_CELL,

3129 NodeKind.TABLE_CELL,

3130 "th",

3131 "td",

3132 ):

3133 print(" UNEXPECTED ROW CONTENT: {}".format(col))

3134 continue

3135

3136 while (

3137 len(row) < len(vertical_still_left)

3138 and vertical_still_left[len(row)] > 0

3139 ):

3140 # vertical_still_left is [...0, 0, 2...] for each

3141 # column. It is populated at the end of the loop, at the

3142 # same time as col_gap_data. This needs to be looped and

3143 # filled this way because each `for col`-looping jumps

3144 # straight to the next meaningful cell; there is no

3145 # "None" cells, only emptiness between, and rowspan and

3146 # colspan are just to generate the "fill-

3147 vertical_still_left[len(row)] -= 1

3148 row.append(col_gap_data[len(row)])

3149

3150 # appending row is how "indexing" is

3151 # done here; something is appended,

3152 # like a filler-cell here or a "start"

3153 # cell at the end of the row-loop,

3154 # which increased len(row) which is

3155 # then used as the target-index to check

3156 # for gaps. vertical_still_left is

3157 # the countdown to when to stop

3158 # filling in gaps, and goes down to 0,

3159 # and col_gap_data is not touched

3160 # except when a new rowspan is needed,

3161 # at the same time that

3162 # vertical_still_left gets reassigned.

3163

3164 try:

3165 rowspan = int(col.attrs.get("rowspan", "1")) # 🡙

3166 colspan = int(col.attrs.get("colspan", "1")) # 🡘

3167 except ValueError:

3168 rowspan = 1

3169 colspan = 1

3170 # print("COL:", col)

3171

3172 # Process any nested tables recursively.

3173 tables, rest = recursively_extract(

3174 col,

3175 lambda x: isinstance(x, WikiNode)

3176 and (x.kind == NodeKind.TABLE or x.sarg == "table"),

3177 )

3178

3179 # Clean the rest of the cell.

3180 celltext = clean_node(wxr, None, rest)

3181 # print("CLEANED:", celltext)

3182

3183 # Handle nested tables.

3184 for tbl in tables:

3185 # Some nested tables (e.g., croí/Irish) have subtitles

3186 # as normal paragraphs in the same cell under a descrip-

3187 # tive text that should be treated as a title (e.g.,

3188 # "Forms with the definite article", with "definite" not

3189 # mentioned elsewhere).

3190 new_titles = list(titles)

3191 if celltext:

3192 new_titles.append(celltext)

3193 subtbl = handle_table1(

3194 wxr,

3195 tablecontext,

3196 word,

3197 lang,

3198 pos,

3199 data,

3200 tbl,

3201 new_titles,

3202 source,

3203 "",

3204 depth + 1,

3205 )

3206 if subtbl: 3206 ↛ 3184line 3206 didn't jump to line 3184 because the condition on line 3206 was always true

3207 sub_ret.append((rows, titles, after, depth))

3208 rows = []

3209 titles = []

3210 after = ""

3211 sub_ret.extend(subtbl)

3212

3213 # This magic value is used as part of header detection

3214 cellstyle = (

3215 col.attrs.get("style", "")

3216 + "//"

3217 + col.attrs.get("class", "")

3218 + "//"

3219 + str(kind)

3220 )

3221

3222 if not row: # if first column in row

3223 style = cellstyle

3224 target = None

3225 titletext = celltext.strip()

3226 while titletext and is_superscript(titletext[-1]):

3227 titletext = titletext[:-1]

3228

3229 (

3230 is_title,

3231 hdr_expansion,

3232 target,

3233 celltext,

3234 ) = determine_header(

3235 wxr,

3236 tablecontext,

3237 lang,

3238 word,

3239 pos,

3240 tree.kind,

3241 kind,

3242 style,

3243 row,

3244 col,

3245 celltext,

3246 titletext,

3247 cols_headered,

3248 None,

3249 cellstyle,

3250 )

3251

3252 if is_title:

3253 # If this cell gets a "*" tag, make the whole column

3254 # below it (toggling it in cols_headered = [F, F, T...])

3255 # into headers.

3256 while len(cols_headered) <= len(row):

3257 cols_headered.append(False)

3258 if any("*" in tt for tt in hdr_expansion):

3259 cols_headered[len(row)] = True

3260 celltext = ""

3261 # if row_has_nonempty_cells has been True at some point, it

3262 # keeps on being True.

3263 # if row_has_nonempty_cells or is_title or celltext != "":

3264 # row_has_nonempty_cells = True

3265 # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓

3266 row_has_nonempty_cells |= is_title or celltext != ""

3267 cell = InflCell(

3268 celltext, is_title, colspan, rowspan, target

3269 )

3270 for _ in range(0, colspan):

3271 # colspan🡘 current loop (col) or 1

3272 # All the data-filling for colspan

3273 # is done simply in this loop,

3274 # while rowspan needs to use

3275 # vertical_still_left to count gaps

3276 # and col_gap_data to fill in

3277 # those gaps with InflCell data.

3278 if rowspan > 1: # rowspan🡙 current loop (col) or 1

3279 while len(col_gap_data) <= len(row):

3280 # Initialize col_gap_data/ed if

3281 # it is lacking slots

3282 # for each column; col_gap_data and

3283 # vertical_still_left are never

3284 # reset to [], during

3285 # the whole table function.

3286 col_gap_data.append(None)

3287 vertical_still_left.append(0)

3288 # Below is where the "rectangle" block of rowspan

3289 # and colspan is filled for the future.

3290 col_gap_data[len(row)] = cell

3291 # col_gap_data contains cells that

3292 # will be used in the

3293 # future, or None

3294 vertical_still_left[len(row)] = rowspan - 1

3295 # A counter for how many gaps🡙 are still left to be

3296 # filled (row.append or

3297 # row[col_gap_data[len(row)] =>

3298 # rows), it is not reset to [], but decremented to 0

3299 # each time a row gets something from col_gap_data.

3300 # Append this cell 1+ times for colspan🡘

3301 row.append(cell)

3302 if not row:

3303 continue

3304 # After looping the original row-nodes above, fill

3305 # in the rest of the row if the final cell has colspan

3306 # (inherited from above, so a cell with rowspan and colspan)

3307 for i in range(len(row), len(vertical_still_left)):

3308 if vertical_still_left[i] <= 0:

3309 continue

3310 vertical_still_left[i] -= 1

3311 while len(row) < i:

3312 row.append(InflCell("", False, 1, 1, None))

3313 row.append(col_gap_data[i])

3314 # print(" ROW {!r}".format(row))

3315 if row_has_nonempty_cells: 3315 ↛ 3092line 3315 didn't jump to line 3092 because the condition on line 3315 was always true

3316 rows.append(row)

3317 elif kind in ( 3317 ↛ 3092line 3317 didn't jump to line 3092 because the condition on line 3317 was always true

3318 NodeKind.TABLE_HEADER_CELL,

3319 NodeKind.TABLE_CELL,

3320 "th",

3321 "td",

3322 "span",

3323 ):

3324 # print(" TOP-LEVEL CELL", node)

3325 pass

3326

3327 if sub_ret:

3328 main_ret = sub_ret

3329 main_ret.append((rows, titles, after, depth))

3330 else:

3331 main_ret = [(rows, titles, after, depth)]

3332 return main_ret

3333

3334 new_rows = handle_table1(

3335 wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0

3336 )

3337

3338 # Now we have a table that has been parsed into rows and columns of

3339 # InflCell objects. Parse the inflection table from that format.

3340 if new_rows: 3340 ↛ exitline 3340 didn't return from function 'handle_wikitext_or_html_table' because the condition on line 3340 was always true

3341 for rows, titles, after, depth in new_rows:

3342 handle_generic_table(

3343 wxr,

3344 tablecontext,

3345 data,

3346 word,

3347 lang,

3348 pos,

3349 rows,

3350 titles,

3351 source,

3352 after,

3353 depth,

3354 )

3355

3356

3357def handle_html_table(

3358 wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None

3359):

3360 """A passer-on function for html-tables, XXX, remove these?"""

3361 handle_wikitext_or_html_table(

3362 wxr, word, lang, pos, data, tree, titles, source, after, tablecontext

3363 )

3364

3365

3366def handle_wikitext_table(

3367 wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None

3368):

3369 """A passer-on function for html-tables, XXX, remove these?"""

3370 handle_wikitext_or_html_table(

3371 wxr, word, lang, pos, data, tree, titles, source, after, tablecontext

3372 )

3373

3374

3375def parse_inflection_section(

3376 wxr, data, word, lang, pos, section, tree, tablecontext=None

3377):

3378 """Parses an inflection section on a page. ``data`` should be the

3379 data for a part-of-speech, and inflections will be added to it."""

3380

3381 # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"

3382 # .format(word, lang, pos, section))

3383 assert isinstance(wxr, WiktextractContext)

3384 assert isinstance(data, dict)

3385 assert isinstance(word, str)

3386 assert isinstance(lang, str)

3387 assert isinstance(section, str)

3388 assert isinstance(tree, WikiNode)

3389 assert tablecontext is None or isinstance(tablecontext, TableContext)

3390 source = section

3391 tables = []

3392 titleparts = []

3393 preceding_bolded_title = ""

3394

3395 def process_tables():

3396 for kind, node, titles, after in tables:

3397 after = "".join(after).strip()

3398 after = clean_value(wxr, after)

3399 if kind == "wikitext":

3400 handle_wikitext_table(

3401 wxr,

3402 word,

3403 lang,

3404 pos,

3405 data,

3406 node,

3407 titles,

3408 source,

3409 after,

3410 tablecontext=tablecontext,

3411 )

3412 elif kind == "html": 3412 ↛ 3426line 3412 didn't jump to line 3426 because the condition on line 3412 was always true

3413 handle_html_table(

3414 wxr,

3415 word,

3416 lang,

3417 pos,

3418 data,

3419 node,

3420 titles,

3421 source,

3422 after,

3423 tablecontext=tablecontext,

3424 )

3425 else:

3426 raise RuntimeError(

3427 "{}: unimplemented table kind {}".format(word, kind)

3428 )

3429

3430 def recurse_navframe(node, titles):

3431 nonlocal tables

3432 nonlocal titleparts

3433 titleparts = []

3434 old_tables = tables

3435 tables = []

3436

3437 recurse(node, [], navframe=True)

3438

3439 process_tables()

3440 tables = old_tables

3441

3442 def recurse(node, titles, navframe=False):

3443 nonlocal tables

3444 if isinstance(node, (list, tuple)):

3445 for x in node:

3446 recurse(x, titles, navframe)

3447 return

3448 if isinstance(node, str):

3449 if tables:

3450 tables[-1][-1].append(node)

3451 elif navframe:

3452 titleparts.append(node)

3453 return

3454 if not isinstance(node, WikiNode): 3454 ↛ 3455line 3454 didn't jump to line 3455 because the condition on line 3454 was never true

3455 if navframe:

3456 wxr.wtp.debug(

3457 "inflection table: unhandled in NavFrame: {}".format(node),

3458 sortid="inflection/2907",

3459 )

3460 return

3461 kind = node.kind

3462 if navframe:

3463 if kind == NodeKind.HTML:

3464 classes = node.attrs.get("class", "").split()

3465 if "NavToggle" in classes: 3465 ↛ 3466line 3465 didn't jump to line 3466 because the condition on line 3465 was never true

3466 return

3467 if "NavHead" in classes:

3468 # print("NAVHEAD:", node)

3469 recurse(node.children, titles, navframe)

3470 return

3471 if "NavContent" in classes:

3472 # print("NAVCONTENT:", node)

3473 title = "".join(titleparts).strip()

3474 title = html.unescape(title)

3475 title = title.strip()

3476 new_titles = list(titles)

3477 if not re.match(r"(Note:|Notes:)", title): 3477 ↛ 3479line 3477 didn't jump to line 3479 because the condition on line 3477 was always true

3478 new_titles.append(title)

3479 recurse(node, new_titles, navframe=False)

3480 return

3481 else:

3482 if kind == NodeKind.TABLE:

3483 tables.append(["wikitext", node, titles, []])

3484 return

3485 elif kind == NodeKind.HTML and node.sarg == "table":

3486 classes = node.attrs.get("class", ())

3487 if "audiotable" in classes:

3488 return

3489 tables.append(["html", node, titles, []])

3490 return

3491 elif kind in ( 3491 ↛ 3498line 3491 didn't jump to line 3498 because the condition on line 3491 was never true

3492 NodeKind.LEVEL2,

3493 NodeKind.LEVEL3,

3494 NodeKind.LEVEL4,

3495 NodeKind.LEVEL5,

3496 NodeKind.LEVEL6,

3497 ):

3498 return # Skip subsections

3499 if (

3500 kind == NodeKind.HTML

3501 and node.sarg == "div"

3502 and "NavFrame" in node.attrs.get("class", "").split()

3503 ):

3504 recurse_navframe(node, titles)

3505 return

3506 if kind == NodeKind.LINK:

3507 if len(node.largs) > 1:

3508 recurse(node.largs[1:], titles, navframe)

3509 else:

3510 recurse(node.largs[0], titles, navframe)

3511 return

3512 if kind == NodeKind.LIST and node.sarg == ";": 3512 ↛ 3514line 3512 didn't jump to line 3514 because the condition on line 3512 was never true

3513 nonlocal preceding_bolded_title

3514 from wiktextract.page import clean_node

3515 preceding_bolded_title = clean_node(wxr, None, node).strip("; ")

3516 for x in node.children:

3517 recurse(x, titles, navframe)

3518

3519 assert tree.kind == NodeKind.ROOT

3520 for x in tree.children:

3521 if preceding_bolded_title != "": 3521 ↛ 3522line 3521 didn't jump to line 3522 because the condition on line 3521 was never true

3522 recurse(x, [preceding_bolded_title])

3523 else:

3524 recurse(x, [])

3525

3526 # Process the tables we found

3527 process_tables()

3528

3529 # XXX this code is used for extracting tables for inflection tests

3530 if wxr.config.expand_tables: 3530 ↛ 3531line 3530 didn't jump to line 3531 because the condition on line 3530 was never true

3531 if section != "Mutation":

3532 with open(wxr.config.expand_tables, "w") as f:

3533 f.write(word + "\n")

3534 f.write(lang + "\n")

3535 f.write(pos + "\n")

3536 f.write(section + "\n")

3537 text = wxr.wtp.node_to_wikitext(tree)

3538 f.write(text + "\n")