Coverage for src/wiktextract/extractor/en/inflection.py: 86%

1475 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1# Code for parsing inflection tables. 

2# 

3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org. 

4 

5import collections 

6import copy 

7import functools 

8import html 

9import itertools 

10import re 

11import unicodedata 

12from typing import Optional, Union 

13 

14from wikitextprocessor import MAGIC_FIRST, NodeKind, WikiNode 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, freeze, split_at_comma_semi 

18from ...tags import valid_tags 

19from ...wxr_context import WiktextractContext 

20from .form_descriptions import ( 

21 classify_desc, 

22 decode_tags, 

23 distw, 

24 parse_head_final_tags, 

25) 

26from .inflectiondata import infl_map, infl_start_map, infl_start_re 

27from .lang_specific_configs import get_lang_conf, lang_specific_tags 

28from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS 

29from .type_utils import FormData 

30 

# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
# None disables the debug output; set via set_debug_cell_text().
debug_cell_text: Optional[str] = None

35 

36 

def set_debug_cell_text(text: str) -> None:
    """Set the module-level cell text that triggers debug printing.

    Used by the --debug-text-cell command-line option; afterwards the
    table-parsing code emits diagnostics for cells matching ``text``.
    """
    global debug_cell_text
    debug_cell_text = text

40 

41 

# Type alias: a list of alternatives, each alternative being a (sorted)
# tuple of tag strings.
TagSets = list[tuple[str, ...]]

43 

# Column texts that are interpreted as an empty column.
# Mostly Unicode hyphen/dash/minus variants, plus a few explicit
# "no such form" phrases.
IGNORED_COLVALUES = {
    "-",
    "־",  # U+05BE Hebrew punctuation maqaf
    "᠆",  # U+1806 Mongolian todo soft hyphen
    "‐",  # U+2010 hyphen
    "‑",  # U+2011 non-breaking hyphen
    "‒",  # U+2012 figure dash
    "–",  # U+2013 en dash
    "—",  # U+2014 em dash
    "―",  # U+2015 horizontal bar
    "−",  # U+2212 minus sign
    "⸺",  # U+2E3A two-em dash
    "⸻",  # U+2E3B three-em dash
    "﹘",  # U+FE58 small em dash
    "﹣",  # U+FE63 small hyphen-minus
    "-",  # U+FF0D fullwidth hyphen-minus
    "/",
    "?",
    "not used",
    "not applicable",
}

66 

# These tags are never inherited from above
# XXX merge with lang_specific
# NOTE(review): these look like numbered infinitive classes (Finnish-style
# conjugation tables) — confirm before extending this set.
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

77 

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags
object_concord_replacements = {
    "first-person": "object-first-person",
    "second-person": "object-second-person",
    "third-person": "object-third-person",
    "singular": "object-singular",
    "plural": "object-plural",
    "definite": "object-definite",
    "indefinite": "object-indefinite",
    # Noun classes 1-18 all follow the identical pattern; generate them.
    **{
        "class-{}".format(n): "object-class-{}".format(n)
        for n in range(1, 19)
    },
    "masculine": "object-masculine",
    "feminine": "object-feminine",
}

109 

# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Matches generic table-header phrases like "Conjugation of x" so they can
# be excluded from tag extraction.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)

149 

# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)

224 

# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "Attic": "Attic",  # e.g. καλός/Greek/Adj
    "Epic": "Epic",  # e.g. καλός/Greek/Adj
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

244 

# Parenthesized element starts to map them to tags for form for the rest of
# the element
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Matches a known element-start keyword followed by a space at the start of
# a parenthesized title element.
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)

265 

266 

# Regexp for cell starts that are likely definitions of reference symbols.
# See also nondef_re.
# Groups 3, 5 and 6 capture the reference symbol itself; extract_cell_content
# reads exactly those group numbers when building (ref, definition) pairs.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"

284 

# Certain tags are moved from headers in tables into word tags, as they always
# apply to the whole word.
# Currently empty: the set was originally created for an issue with number
# paradigms in Arabic, but that is being handled elsewhere now.
TAGS_FORCED_WORDTAGS: set[str] = set()

293 

294 

class InflCell:
    """One cell of an inflection table: its (stripped) text, whether it acts
    as a header, its col/row spans, and an optional link target."""

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        # The stored text is stripped, but header-ness is decided on the raw
        # text: an all-whitespace/empty cell is never treated as a title.
        self.text = text.strip()
        self.is_title = text and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        rendered = f"{self.text}/{self.is_title}/{self.colspan}/{self.rowspan!r}"
        if self.target:
            rendered += f": {self.target!r}"
        return rendered

    def __repr__(self) -> str:
        return str(self)

335 

336 

class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: TagSets,
        text: str,
        all_headers_row: bool,
    ) -> None:
        # Validate arguments (note: rowspan itself is not range-checked).
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        for alt in tagsets:
            assert isinstance(alt, tuple)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Canonicalize each alternative: deduplicate and sort its tags.
        self.tagsets = [tuple(sorted(set(alt))) for alt in tagsets]
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False

377 

378 

def is_superscript(ch: str) -> bool:
    """Return True if ``ch`` (a single character) is superscript-like.

    A character qualifies when its Unicode name begins with "SUPERSCRIPT"
    or "MODIFIER LETTER SMALL"/"MODIFIER LETTER CAPITAL"; characters with
    no Unicode name are never superscript.
    """
    assert isinstance(ch, str) and len(ch) == 1
    try:
        uni_name = unicodedata.name(ch)
    except ValueError:
        # Unnamed/unassigned code point: nothing to classify by.
        return False
    return uni_name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )

395 

396 

def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no purpose
    together (cover all options).

    Mutates ``tags`` in place and returns None.  ``lang``/``pos`` select the
    language-specific configuration consulted via ``get_lang_conf``.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Opposite-pair removals, each gated by a language-specific config flag.
    if (
        "animate" in tags
        and "inanimate" in tags
        and get_lang_conf(lang, "animate_inanimate_remove")
    ):
        tags.remove("animate")
        tags.remove("inanimate")
    if (
        "virile" in tags
        and "nonvirile" in tags
        and get_lang_conf(lang, "virile_nonvirile_remove")
    ):
        tags.remove("virile")
        tags.remove("nonvirile")
    # If every value of a whole category defined for the language (all its
    # numbers, genders, voices, strengths, persons, or definitenesses) is
    # present, the tags carry no information: remove them all.  This loop
    # replaces six copy-pasted stanzas of identical logic.
    for category in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        values = get_lang_conf(lang, category)
        if values and all(v in tags for v in values):
            for v in values:
                tags.remove(v)

447 

448 

def tagset_cats(tagset: TagSets) -> set[str]:
    """Return the set of tag categories for ``tagset`` (merged from all
    alternatives)."""
    return {valid_tags[tag] for alt in tagset for tag in alt}

453 

454 

def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed to be lists of sorted tuples.
    Always returns at least one alternative (the empty tuple if nothing
    merged in)."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # BUGFIX: this assert previously re-checked tagsets1 (copy-paste error),
    # leaving tagsets2 unvalidated.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Insert one alternative into ``tagsets``, merging it with an
        # existing alternative when they differ in at most one tag category.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            # print("tags1={} tags2={} num_differ={}"
            #       .format(tags1, tags2, num_differ))
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        tagsets.append(())

    # print("or_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, tagsets))
    return tagsets

517 

518 

def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking the union of all combinations, without trying
    to determine whether they are compatible.  Returns a list of deduplicated,
    sorted tag tuples (one per combination)."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # BUGFIX: this assert previously re-checked tagsets1 (copy-paste error),
    # leaving tagsets2 unvalidated.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            # The placeholder tag for ignored text cells must not leak into
            # the combined result.
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    # print("and_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, new_tagsets))
    return new_tagsets

545 

546 

@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags).

    ``refs`` is a list of reference-marker strings found on the cell,
    ``defs`` is a list of (ref, definition-text) pairs when the cell itself
    defines reference symbols, and ``tags`` are header tags extracted from
    markers.  Results are memoized on (lang, word, col).
    """
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags = []
    # Normalize: drop trailing comma/bullet and collapse whitespace runs.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Cells that begin with explanatory prose are not data; return a dummy
    # marker so the caller skips them.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while True:
        # Peel trailing "^x" / "^(...)" reference markers one at a time.
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # The cell defines reference symbols; split it into (ref, text)
        # pairs.  Groups 3/5/6 of def_re hold the reference symbol.
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    # .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags

679 

680 

@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is a list of dictionaries describing additional
    forms to be included in the part-of-speech entry."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Strip HTML entities/tags and normalize whitespace before matching.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags = []
    table_tags = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip generic "Inflection/Conjugation/... of X" header phrases.
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    # French reflexive verb tables start with "Conjugation of s'.../se ...".
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                # An element like "class IV" maps its prefix to a tag and
                # the remainder becomes the form text.
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contain no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms

766 

767 

768def expand_header( 

769 wxr: WiktextractContext, 

770 tablecontext: "TableContext", 

771 word: str, 

772 lang: str, 

773 pos: str, 

774 text: str, 

775 base_tags: Union[list[str], set[str], tuple[str, ...]], 

776 silent=False, 

777 ignore_tags=False, 

778 depth=0, 

779) -> list[tuple[str, ...]]: 

780 """Expands a cell header to tagset, handling conditional expressions 

781 in infl_map. This returns list of tuples of tags, each list element 

782 describing an alternative interpretation. ``base_tags`` is combined 

783 column and row tags for the cell in which the text is being interpreted 

784 (conditional expressions in inflection data may depend on it). 

785 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags`` 

786 is True, then tags listed in "if" will be ignored in the test (this is 

787 used when trying to heuristically detect whether a non-<th> cell is anyway 

788 a header).""" 

789 assert isinstance(wxr, WiktextractContext) 

790 assert isinstance(word, str) 

791 assert isinstance(lang, str) 

792 assert isinstance(pos, str) 

793 assert isinstance(text, str) 

794 assert isinstance(base_tags, (list, tuple, set)) 

795 assert silent in (True, False) 

796 assert isinstance(depth, int) 

797 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags)) 

798 # First map the text using the inflection map 

799 text = clean_value(wxr, text) 

800 combined_return: list[tuple[str, ...]] = [] 

801 parts = split_at_comma_semi(text, separators=[";"]) 

802 for text in parts: 

803 if not text: 803 ↛ 804line 803 didn't jump to line 804 because the condition on line 803 was never true

804 continue 

805 if text in infl_map: 

806 v = infl_map[text] # list or string 

807 else: 

808 m = re.match(infl_start_re, text) 

809 if m is not None: 809 ↛ 810line 809 didn't jump to line 810 because the condition on line 809 was never true

810 v = infl_start_map[m.group(1)] 

811 # print("INFL_START {} -> {}".format(text, v)) 

812 elif re.match(r"Notes", text): 

813 # Ignored header 

814 # print("IGNORING NOTES") 

815 combined_return = or_tagsets( 

816 lang, pos, combined_return, [("dummy-skip-this",)] 

817 ) 

818 # this just adds dummy-skip-this 

819 continue 

820 elif text in IGNORED_COLVALUES: 

821 combined_return = or_tagsets( 

822 lang, pos, combined_return, [("dummy-ignore-skipped",)] 

823 ) 

824 continue 

825 # Try without final parenthesized part 

826 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text) 

827 if text_without_parens in infl_map: 

828 v = infl_map[text_without_parens] 

829 elif m is None: 829 ↛ 845line 829 didn't jump to line 845 because the condition on line 829 was always true

830 if not silent: 

831 wxr.wtp.debug( 

832 "inflection table: unrecognized header: {}".format( 

833 repr(text) 

834 ), 

835 sortid="inflection/735", 

836 ) 

837 # Unrecognized header 

838 combined_return = or_tagsets( 

839 lang, pos, combined_return, [("error-unrecognized-form",)] 

840 ) 

841 continue 

842 

843 # Then loop interpreting the value, until the value is a simple string. 

844 # This may evaluate nested conditional expressions. 

845 default_then = None 

846 while True: 

847 # If it is a string, we are done. 

848 if isinstance(v, str): 

849 tags = set(v.split()) 

850 remove_useless_tags(lang, pos, tags) 

851 tagset = [tuple(sorted(tags))] 

852 break 

853 # For a list, just interpret it as alternatives. (Currently the 

854 # alternatives must directly be strings.) 

855 if isinstance(v, (list, tuple)): 

856 tagset = [] 

857 for x in v: 

858 tags = set(x.split()) 

859 remove_useless_tags(lang, pos, tags) 

860 tags_t = tuple(sorted(tags)) 

861 if tags_t not in tagset: 861 ↛ 857line 861 didn't jump to line 857 because the condition on line 861 was always true

862 tagset.append(tags_t) 

863 break 

864 # Otherwise the value should be a dictionary describing a 

865 # conditional expression. 

866 if not isinstance(v, dict): 866 ↛ 867line 866 didn't jump to line 867 because the condition on line 866 was never true

867 wxr.wtp.debug( 

868 "inflection table: internal: " 

869 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]), 

870 sortid="inflection/767", 

871 ) 

872 tagset = [()] 

873 break 

874 # Evaluate the conditional expression. 

875 assert isinstance(v, dict) 

876 cond: Union[bool, str] = "default-true" 

877 c: Union[str, list[str], set[str]] = "" 

878 # Handle "lang" condition. The value must be either a 

879 # single language or a list of languages, and the 

880 # condition evaluates to True if the table is one of 

881 # those languages. 

882 if "lang" in v: 

883 c = v["lang"] 

884 if isinstance(c, str): 

885 cond = c == lang 

886 else: 

887 assert isinstance(c, (list, tuple, set)) 

888 cond = lang in c 

889 # Handle "nested-table-depth" condition. The value must 

890 # be an int or list of ints, and the condition evaluates 

891 # True if the depth is one of those values. 

892 # "depth" is how deep into a nested table tree the current 

893 # table lies. It is first started in handle_wikitext_table, 

894 # so only applies to tables-within-tables, not other 

895 # WikiNode content. `depth` is currently only passed as a 

896 # parameter down the table parsing stack, and not stored. 

897 if cond and "nested-table-depth" in v: 897 ↛ 898line 897 didn't jump to line 898 because the condition on line 897 was never true

898 d = v["nested-table-depth"] 

899 if isinstance(d, int): 

900 cond = d == depth 

901 else: 

902 assert isinstance(d, (list, tuple, set)) 

903 cond = depth in d 

904 # Handle inflection-template condition. Must be a string 

905 # or list of strings, and if tablecontext.template_name is in 

906 # those, accept the condition. 

907 # TableContext.template_name is passed down from page/ 

908 # parse_inflection, before parsing and expanding itself 

909 # has begun. 

910 if cond and tablecontext and "inflection-template" in v: 

911 d1 = v["inflection-template"] 

912 if isinstance(d1, str): 912 ↛ 915line 912 didn't jump to line 915 because the condition on line 912 was always true

913 cond = d1 == tablecontext.template_name 

914 else: 

915 assert isinstance(d1, (list, tuple, set)) 

916 cond = tablecontext.template_name in d1 

917 # Handle "pos" condition. The value must be either a single 

918 # part-of-speech or a list of them, and the condition evaluates to 

919 # True if the part-of-speech is any of those listed. 

920 if cond and "pos" in v: 

921 c = v["pos"] 

922 if isinstance(c, str): 

923 cond = c == pos 

924 else: 

925 assert isinstance(c, (list, tuple, set)) 

926 cond = pos in c 

927 # Handle "if" condition. The value must be a string containing a 

928 # space-separated list of tags. The condition evaluates to True if 

929 # ``base_tags`` contains all of the listed tags. If the condition 

930 # is of the form "any: ...tags...", then any of the tags will be 

931 # enough. 

932 if cond and "if" in v and not ignore_tags: 

933 c = v["if"] 

934 assert isinstance(c, str) 

935 # "if" condition is true if any of the listed tags is present if 

936 # it starts with "any:", otherwise all must be present 

937 if c.startswith("any: "): 

938 cond = any(t in base_tags for t in c[5:].split()) 

939 else: 

940 cond = all(t in base_tags for t in c.split()) 

941 

942 # Handle "default" assignment. Store the value to be used 

943 # as a default later. 

944 if "default" in v: 

945 assert isinstance(v["default"], str) 

946 default_then = v["default"] 

947 

948 # Warning message about missing conditions for debugging. 

949 

950 if cond == "default-true" and not default_then and not silent: 

951 wxr.wtp.debug( 

952 "inflection table: IF MISSING COND: word={} " 

953 "lang={} text={} base_tags={} c={} cond={}".format( 

954 word, lang, text, base_tags, c, cond 

955 ), 

956 sortid="inflection/851", 

957 ) 

958 # Based on the result of evaluating the condition, select either 

959 # "then" part or "else" part. 

960 if cond: 

961 v = v.get("then", "") 

962 else: 

963 v1 = v.get("else") 

964 if v1 is None: 

965 if default_then: 

966 v = default_then 

967 else: 

968 if not silent: 

969 wxr.wtp.debug( 

970 "inflection table: IF WITHOUT ELSE EVALS " 

971 "False: " 

972 "{}/{} {!r} base_tags={}".format( 

973 word, lang, text, base_tags 

974 ), 

975 sortid="inflection/865", 

976 ) 

977 v = "error-unrecognized-form" 

978 else: 

979 v = v1 

980 

981 # Merge the resulting tagset from this header part with the other 

982 # tagsets from the whole header 

983 combined_return = or_tagsets(lang, pos, combined_return, tagset) 

984 

985 # Return the combined tagsets, or empty tagset if we got no tagsets 

986 if not combined_return: 

987 combined_return = [()] 

988 return combined_return 

989 

990 

def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list["HdrSpan"],  # was annotated list[str]; holds HdrSpan objects
    start: int,
    colspan: int,
    celltext: str,  # was annotated int; asserted str below (debugging only)
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Walks ``hdrspans`` bottom-up (headers closest to the cell first) and
    merges their tagsets into a combined list of alternative tag tuples
    for the cell occupying columns [start, start+colspan).  Language-
    specific configuration (via get_lang_conf) controls when a header
    higher up in the table stops or is skipped, to prevent categories
    from "bleeding" across unrelated table sections.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # For debugging, set debug_cell_text (--debug-text-cell) to the form for
    # whose cell you want debug prints.
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                "  row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    # (start, colspan) keys already consumed; blocks reuse of the same span
    # from a higher row (except for "register" tags, see below).
    used = set()
    coltags = [()]
    last_header_row = 1000000  # sentinel: no header row accepted yet
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000  # sentinel: not inside any row yet
    used_hdrspans = set()  # id()s of hdrspans already merged
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell.  Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            # Partial overlap on the right edge (unless the header was
            # artificially expanded to cover the row) -> also unrelated.
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present.
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            # in_cats = tag categories occurring inside the cell's span
            # on this header row.
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets.  This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row.  Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside
                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above.  However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized to the 1000000 sentinel
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags

1371 

1372 

1373def parse_simple_table( 

1374 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth 

1375): 

1376 """This is the default table parser. Despite its name, it can parse 

1377 complex tables. This returns a list of forms to be added to the 

1378 part-of-speech, or None if the table could not be parsed.""" 

1379 assert isinstance(wxr, WiktextractContext) 

1380 assert isinstance(tablecontext, TableContext) 

1381 assert isinstance(word, str) 

1382 assert isinstance(lang, str) 

1383 assert isinstance(pos, str) 

1384 assert isinstance(rows, list) 

1385 assert isinstance(source, str) 

1386 assert isinstance(after, str) 

1387 assert isinstance(depth, int) 

1388 for row in rows: 

1389 for col in row: 

1390 assert isinstance(col, InflCell) 

1391 assert isinstance(titles, list) 

1392 for x in titles: 

1393 assert isinstance(x, str) 

1394 

1395 # print("PARSE_SIMPLE_TABLE: TITLES:", titles) 

1396 if debug_cell_text: 1396 ↛ 1397line 1396 didn't jump to line 1397 because the condition on line 1396 was never true

1397 print("ROWS:") 

1398 for row in rows: 

1399 print(" ", row) 

1400 

1401 # Check for forced rowspan kludge. See e.g. 

1402 # maorski/Serbo-Croatian. These are essentially multi-row 

1403 # cells implemented using <br> rather than separate cell. We fix this 

1404 # by identifying rows where this happens, and splitting the current row 

1405 # to multiple rows by synthesizing additional cells. 

1406 new_rows = [] 

1407 for row in rows: 

1408 split_row = ( 

1409 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row) 

1410 and 

1411 # x is an InflCell 

1412 all(x.rowspan == 1 for x in row) 

1413 ) 

1414 if not split_row: 

1415 new_rows.append(row) 

1416 continue 

1417 row1 = [] 

1418 row2 = [] 

1419 for cell in row: 

1420 cell1 = copy.deepcopy(cell) 

1421 if "\n" in cell.text: 

1422 # Has more than one line - split this cell 

1423 parts = cell.text.strip().splitlines() 

1424 if len(parts) != 2: 1424 ↛ 1425line 1424 didn't jump to line 1425 because the condition on line 1424 was never true

1425 wxr.wtp.debug( 

1426 "forced rowspan kludge got {} parts: {!r}".format( 

1427 len(parts), cell.text 

1428 ), 

1429 sortid="inflection/1234", 

1430 ) 

1431 cell2 = copy.deepcopy(cell) 

1432 cell1.text = parts[0] 

1433 cell2.text = parts[1] 

1434 else: 

1435 cell1.rowspan = 2 

1436 cell2 = cell1 # ref, not a copy 

1437 row1.append(cell1) 

1438 row2.append(cell2) 

1439 new_rows.append(row1) 

1440 new_rows.append(row2) 

1441 rows = new_rows 

1442 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:") 

1443 # for row in rows: 

1444 # print(" ", row) 

1445 

1446 # Parse definitions for references (from table itself and from text 

1447 # after it) 

1448 def_ht = {} 

1449 

1450 def add_defs(defs: list[tuple[str, str]]) -> None: 

1451 for ref, d in defs: 

1452 # print("DEF: ref={} d={}".format(ref, d)) 

1453 d = d.strip() 

1454 d = d.split(". ")[0].strip() # text before ". " 

1455 if not d: 1455 ↛ 1456line 1455 didn't jump to line 1456 because the condition on line 1455 was never true

1456 continue 

1457 if d.endswith("."): # catc ".."?? 

1458 d = d[:-1] 

1459 tags, topics = decode_tags(d, no_unknown_starts=True) 

1460 # print(f"{ref=}, {d=}, {tags=}") 

1461 if topics or any("error-unknown-tag" in ts for ts in tags): 

1462 d = d[0].lower() + d[1:] 

1463 tags, topics = decode_tags(d, no_unknown_starts=True) 

1464 if topics or any("error-unknown-tag" in ts for ts in tags): 

1465 # Failed to parse as tags 

1466 # print("Failed: topics={} tags={}" 

1467 # .format(topics, tags)) 

1468 continue 

1469 tags1_s: set[str] = set() 

1470 for ts in tags: 

1471 tags1_s.update(ts) 

1472 tags1 = tuple(sorted(tags1_s)) 

1473 # print("DEFINED: {} -> {}".format(ref, tags1)) 

1474 def_ht[ref] = tags1 

1475 

    def generate_tags(
        rowtags: list[tuple[str]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        """Expand the current header cell into tag tuples.

        For every (row tagset x column tagset) combination, expands the
        header text via expand_header and accumulates the results into:
        new_rowtags (updated row tagsets), new_coltags (tagsets for the
        column), and all_hdr_tags (every distinct alternative produced
        by the header).  Mutates ``table_tags`` in place when a tag in
        TAGS_FORCED_WORDTAGS must be promoted to word level.  Relies on
        closure variables of parse_simple_table (hdrspans, col_idx,
        colspan, col, text, global_tags, refs_tags, hdr_tags, depth).
        """
        new_coltags = []
        all_hdr_tags = []  # list of tuples
        new_rowtags = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                # Union of row tags, column tags, global tags and the
                # word-level table tags collected so far.
                base_tags: set[str] = (
                    set(rt0)
                    | set(ct0)
                    | set(global_tags)
                    | set(itertools.chain.from_iterable(table_tags))
                )  # Union.
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Certain tags are always moved to word-level tags;
                    # note this mutates the caller's table_tags list.
                    if tt_s & TAGS_FORCED_WORDTAGS:
                        table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS)
                        tt_s = tt_s - TAGS_FORCED_WORDTAGS
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags

1536 

    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        """Register a new header span and manage left-column expansion.

        Appends a HdrSpan built from the closure state (col_idx, colspan,
        rowspan, rownum, new_coltags, all_headers) to ``hdrspans`` and
        decides whether the left-side header (``col0_hdrspan``) should be
        kept, widened over the row, or replaced by the new span.  Returns
        the (possibly unchanged) cell text, the updated
        col0_followed_by_nonempty flag, and the current col0_hdrspan.
        """
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags).  In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            # No left-side header yet; this span becomes it.
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            #   - col0_hdrspan set, continue with allowed current
            #   - col0_hdrspan set, expand, start new
            #   - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan.  Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan
                # *Expand* current col0_hdrspan to reach this column.
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan

1634 

def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
    """Split a table-cell text into alternative forms.

    Returns a tuple ``(col, alts, split_extra_tags)`` where ``col`` is the
    (possibly rewritten) cell text, ``alts`` is the list of alternative
    forms extracted from it, and ``split_extra_tags`` is a list of extra
    tags implied by a language-specific special split (empty otherwise).

    Uses the closure variables ``lang``, ``special_phrase_splits`` and the
    module-level helpers; not usable outside the enclosing parser.
    """
    split_extra_tags: list[str] = []
    if col and is_superscript(col[0]):
        # A cell starting with a superscript is not split at all.
        alts = [col]
    else:
        separators = [";", "•", r"\n", " or "]
        if " + " not in col:
            separators.append(",")
        if not col.endswith("/"):
            separators.append("/")
        if col in special_phrase_splits:
            # Use language-specific special splits.
            # These are phrases and constructions that have
            # unique ways of splitting, not specific characters
            # to split on like with the default splitting.
            alts, tags = special_phrase_splits[col]
            split_extra_tags = tags.split()
            for x in split_extra_tags:
                assert x in valid_tags
            assert isinstance(alts, (list, tuple))
            assert isinstance(tags, str)
        else:
            # Use default splitting. However, recognize
            # language-specific replacements and change them to magic
            # characters before splitting. This way we won't split
            # them. This is important for, e.g., recognizing
            # alternative pronouns.
            # The magic characters are characters out of Unicode scope
            # that are given a simple incremental value, int > unicode.
            repls: dict[str, str] = {}
            magic_ch = MAGIC_FIRST
            trs = get_lang_conf(lang, "form_transformations")
            # trs is a list of lists of strings
            for _, v, _, _ in trs:
                # v is a pattern string, like "^ich"
                # form_transformations data is doing double-duty here,
                # because the pattern strings are already known to us and
                # not meant to be split.
                m = re.search(v, col)
                if m is not None:
                    # Pattern found in text: hide it behind a magic char.
                    magic = chr(magic_ch)
                    magic_ch += 1  # next magic character value
                    col = re.sub(v, magic, col)  # replace with magic ch
                    repls[magic] = m.group(0)
                    # remember what regex match string each magic char
                    # replaces. .group(0) is the whole match.
            alts0 = split_at_comma_semi(col, separators=separators)
            # with magic characters in place, split the text so that
            # pre-transformation text is out of the way.
            alts = []
            for alt in alts0:
                # Restore the original texts in place of the magic
                # characters.
                for k, v in repls.items():
                    # BUGFIX: use str.replace instead of re.sub here.
                    # The replacement text v is arbitrary matched text;
                    # re.sub would interpret backslashes in it as regex
                    # replacement escapes (e.g. "\1"), corrupting the
                    # restored form or raising re.error. k is a single
                    # literal character, so plain replace is exact.
                    alt = alt.replace(k, v)
                alts.append(alt)
    # Remove "*" from beginning of forms, as in non-attested
    # or reconstructed forms. Otherwise it might confuse romanization
    # detection.
    alts = [re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts]
    alts = [
        x for x in alts if not re.match(r"pronounced with |\(with ", x)
    ]
    alts = [
        re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts
    ]
    # Check for parenthesized alternatives, e.g. ripromettersi/Italian
    if all(
        re.match(r"\w+( \w+)* \(\w+( \w+)*(, \w+( \w+)*)*\)$", alt)
        # word word* \(word word*(, word word*)*\)
        and all(
            distw([re.sub(r" \(.*", "", alt)], x) < 0.5
            # Levenshtein distance
            for x in re.sub(r".*\((.*)\)", r"\1", alt).split(", ")
        )
        # Extract from parentheses for testing
        for alt in alts
    ):
        new_alts = []
        for alt in alts:
            # Replace parentheses before splitting
            alt = alt.replace(" (", ", ")
            alt = alt.replace(")", "")
            for new_alt in alt.split(", "):
                new_alts.append(new_alt)
        alts = new_alts
    return col, alts, split_extra_tags

1724 

def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
    """Pair up forms with their romanizations / IPA from a mixed cell.

    Returns a list of ``(form, romanization, ipa)`` triples; an empty
    string marks a missing romanization or IPA.  The branches below are
    layout heuristics tried in order; their ordering is significant.
    Relies on the module-level helpers ``classify_desc`` and
    ``is_superscript``.
    """
    # Handle the special case where romanization is given under
    # normal form, e.g. in Russian. There can be multiple
    # comma-separated forms in each case. We also handle the case
    # where instead of romanization we have IPA pronunciation
    # (e.g., avoir/French/verb).
    len2 = len(alts) // 2
    # Check for IPAs (forms first, IPAs under)
    # base, base, IPA, IPA
    if (
        len(alts) % 2 == 0  # Divisibly by two
        and all(
            re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
            for x in alts[len2:]
        )
    ):  # In the second half of alts
        nalts = list(
            (alts[i], "", alts[i + len2])
            # List of tuples: (base, "", ipa)
            for i in range(len2)
        )
    # base, base, base, IPA
    elif (
        len(alts) > 2
        and re.match(r"^\s*/.*/\s*$", alts[-1])
        and all(not x.startswith("/") for x in alts[:-1])
    ):
        # Only if the last alt is IPA: every base shares that one IPA.
        nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
    # base, IPA, IPA, IPA
    elif (
        len(alts) > 2
        and not alts[0].startswith("/")
        and all(
            re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
        )
    ):
        # First is base and the rest is IPA alternatives
        nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

    # Check for romanizations, forms first, romanizations under
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    # Remove ends of strings starting from ^.
                    # Superscripts have been already removed
                    # from the string, while ^xyz needs to be
                    # removed separately, though it's usually
                    # something with a single letter?
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            == "other"
            for x in alts[:len2]
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for x in alts[len2:]
        )
    ):
        # First half classifies as native script, second half as
        # romanization/English: pair them positionally.
        nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
    # Check for romanizations, forms and romanizations alternating
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            == "other"
            for i in range(0, len(alts), 2)
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for i in range(1, len(alts), 2)
        )
    ):
        # evens are forms, odds are their romanizations
        nalts = list(
            (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
        )
    else:
        # Fallback: no romanization/IPA layout detected. Expand
        # parenthesized optional segments like "kind(er)" or
        # "lampai(tten/den)" into all their variants.
        new_alts = []
        for alt in alts:
            lst = [""]
            idx = 0
            for m in re.finditer(
                r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                # start OR letter OR asterisk (word/word*)
                # \\___________group 1_______/ \  \_g3_///
                #  \                            \__gr. 2_//
                #   \_____________group 0________________/
                alt,
            ):
                v = m.group(2)  # (word/word/word...)
                if (
                    classify_desc(v) == "tags"  # Tags inside parens
                    or m.group(0) == alt
                ):  # All in parens
                    continue
                new_lst = []
                for x in lst:
                    x += alt[idx : m.start()] + m.group(1)
                    # alt until letter or asterisk
                    idx = m.end()
                    vparts = v.split("/")
                    # group(2) = ["word", "wörd"...]
                    if len(vparts) == 1:
                        new_lst.append(x)
                        new_lst.append(x + v)
                        # "kind(er)" -> ["kind", "kinder"]
                    else:
                        for vv in vparts:
                            new_lst.append(x + vv)
                        # "lampai(tten/den)" ->
                        # ["lampaitten", "lampaiden"]
                lst = new_lst
            for x in lst:
                new_alts.append(x + alt[idx:])
                # add the end of alt
        nalts = list((x, "", "") for x in new_alts)
        # [form, no romz, no ipa]
    return nalts

1872 

def find_semantic_parens(form: str) -> tuple[str, list[str]]:
    """Strip semantically meaningful brackets from *form*.

    "Some languages" (=Greek) wrap forms in brackets to mark register:
    (informality), [rarity] and {archaicity}.  The brackets are always
    removed; the corresponding tag is added only when the language
    configuration enables that bracket convention.  Returns the
    unwrapped form and the list of extra tags.
    """
    extra_tags: list[str] = []
    if re.match(r"\([^][(){}]*\)$", form):
        # (form) -> informal, if the language uses that convention
        form = form[1:-1]
        if get_lang_conf(lang, "parentheses_for_informal"):
            extra_tags.append("informal")
    elif re.match(r"\{\[[^][(){}]*\]\}$", form):
        # {[form]} -> rare + archaic, e.g. είμαι/Greek/Verb
        form = form[2:-2]
        if get_lang_conf(
            lang, "square_brackets_for_rare"
        ) and get_lang_conf(lang, "curly_brackets_for_archaic"):
            extra_tags.extend(["rare", "archaic"])
    elif re.match(r"\{[^][(){}]*\}$", form):
        # {form} -> archaic, e.g. είμαι/Greek/Verb
        form = form[1:-1]
        if get_lang_conf(lang, "curly_brackets_for_archaic"):
            extra_tags.append("archaic")
    elif re.match(r"\[[^][(){}]*\]$", form):
        # [form] -> rare, e.g. είμαι/Greek/Verb
        form = form[1:-1]
        if get_lang_conf(lang, "square_brackets_for_rare"):
            extra_tags.append("rare")
    return form, extra_tags

1907 

def handle_parens(
    form: str, roman: str, clitic: str, extra_tags: list[str]
) -> tuple[str, str, str]:
    """Interpret a parenthesized fragment already found in *form*.

    NOTE: this closure also reads ``paren`` (the text inside the
    parentheses), ``m`` (the regex match locating them in *form*) and
    ``subst`` (the whitespace to splice in their place) from the
    enclosing scope, so it must be called right after that match is
    made.  *extra_tags* is mutated in place when the parenthesized text
    decodes to tags.  Returns the updated ``(form, roman, clitic)``.
    """
    if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
        # is there a clitic starting with apostrophe?
        clitic = paren
        # assume the whole paren is a clitic
        # then remove paren from form
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif classify_desc(paren) == "tags":
        tagsets1, topics1 = decode_tags(paren)
        if not topics1:
            # Only accept when the paren decodes purely to tags
            # (no topics).
            for ts in tagsets1:
                ts = tuple(x for x in ts if " " not in x)
                # There are some generated tags containing
                # spaces; do not let them through here.
                extra_tags.extend(ts)
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
    # brackets contain romanization
    elif (
        m.start() > 0
        and not roman
        and classify_desc(form[: m.start()]) == "other"
        and
        # "other" ~ text
        classify_desc(paren) in ("romanization", "english")
        and not re.search(r"^with |-form$", paren)
    ):
        roman = paren
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif re.search(r"^with |-form", paren):
        # Usage note like "with ..." or "...-form": just drop it.
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    return form, roman, clitic

1941 

def merge_row_and_column_tags(form, some_has_covered_text):
    """Combine row tags and column tags into final form entries.

    Iterates the cartesian product of the enclosing scope's ``rowtags``
    and ``coltags``, merges them with ``global_tags``, ``extra_tags``,
    ``refs_tags`` and the table-section header, applies a pipeline of
    language- and POS-specific tag cleanups, and builds the output form
    dicts (plus a separate entry for a clitic, if one was detected).

    Returns ``(ret, form, some_has_covered_text)`` where ``ret`` is the
    list of form dicts; ``form`` and the flag are returned because both
    may be modified inside the loop.
    """
    # Merge column tags and row tags. We give preference
    # to moods etc coming from rowtags (cf. austteigen/German/Verb
    # imperative forms).

    # In certain cases, what a tag means depends on whether
    # it is a row or column header. Depending on the language,
    # we replace certain tags with others if they're in
    # a column or row

    ret = []
    # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
    # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
    for rt in sorted(rowtags):
        if "dummy-use-as-coltags" in rt:
            # This tagset is meant to act as column tags; skip as a row.
            continue
        # if lang was in rowtag_replacements)
        # if not rtagreplacs == None:
        #     rt = replace_directional_tags(rt, rtagreplacs)
        for ct in sorted(coltags):
            if "dummy-use-as-rowtags" in ct:
                # Mirror case: this tagset acts as row tags only.
                continue
            # if lang was in coltag_replacements
            # if not ctagreplacs == None:
            #     ct = replace_directional_tags(ct,
            #                                   ctagreplacs)
            tags = set(global_tags)
            tags.update(extra_tags)
            tags.update(rt)
            tags.update(refs_tags)
            tags.update(tablecontext.section_header)
            # Merge tags from column. For certain kinds of tags,
            # those coming from row take precedence.
            old_tags = set(tags)
            for t in ct:
                c = valid_tags[t]
                # Row already supplies this mood/case/number: the
                # column's competing tag is dropped.
                if c in ("mood", "case", "number") and any(
                    valid_tags[tt] == c for tt in old_tags
                ):
                    continue
                tags.add(t)

            # Extract language-specific tags from the
            # form. This may also adjust the form.
            form, lang_tags = lang_specific_tags(lang, pos, form)
            tags.update(lang_tags)

            # For non-finite verb forms, see if they have
            # a gender/class suffix
            if pos == "verb" and any(
                valid_tags[t] == "non-finite" for t in tags
            ):
                form, tt = parse_head_final_tags(wxr, lang, form)
                tags.update(tt)

            # Remove "personal" tag if have nth person; these
            # come up with e.g. reconhecer/Portuguese/Verb. But
            # not if we also have "pronoun"
            if (
                "personal" in tags
                and "pronoun" not in tags
                and any(
                    x in tags
                    for x in [
                        "first-person",
                        "second-person",
                        "third-person",
                    ]
                )
            ):
                tags.remove("personal")

            # If we have impersonal, remove person and number.
            # This happens with e.g. viajar/Portuguese/Verb
            if "impersonal" in tags:
                tags = tags - set(
                    [
                        "first-person",
                        "second-person",
                        "third-person",
                        "singular",
                        "plural",
                    ]
                )

            # Remove unnecessary "positive" tag from verb forms
            if pos == "verb" and "positive" in tags:
                if "negative" in tags:
                    tags.remove("negative")
                tags.remove("positive")

            # Many Russian (and other Slavic) inflection tables
            # have animate/inanimate distinction that generates
            # separate entries for neuter/feminine, but the
            # distinction only applies to masculine. Remove them
            # form neuter/feminine and eliminate duplicates.
            if get_lang_conf(lang, "masc_only_animate"):
                for t1 in ("animate", "inanimate"):
                    for t2 in ("neuter", "feminine"):
                        if (
                            t1 in tags
                            and t2 in tags
                            and "masculine" not in tags
                            and "plural" not in tags
                        ):
                            tags.remove(t1)

            # German adjective tables contain "(keiner)" etc
            # for mixed declension plural. When the adjective
            # disappears and it becomes just one word, remove
            # the "includes-article" tag. e.g. eiskalt/German
            if "includes-article" in tags and " " not in form:
                tags.remove("includes-article")

            # Handle ignored forms. We mark that the form was
            # provided. This is important information; some words
            # just do not have a certain form. However, there also
            # many cases where no word in a language has a
            # particular form. Post-processing could detect and
            # remove such cases.
            if form in IGNORED_COLVALUES:
                # if cell text seems to be ignorable
                if "dummy-ignore-skipped" in tags:
                    continue
                if (
                    col_idx not in has_covering_hdr
                    and some_has_covered_text
                ):
                    continue
                # don't ignore this cell if there's been a header
                # above it
                form = "-"
            elif col_idx in has_covering_hdr:
                some_has_covered_text = True

            # Handle ambiguous object concord. If a header
            # gives the "dummy-object-concord"-tag to a word,
            # replace person, number and gender tags with
            # their "object-" counterparts so that the verb
            # agrees with the object instead.
            # Use only when the verb has ONLY object agreement!
            # a پخول/Pashto
            if "dummy-object-concord" in tags:
                for subtag, objtag in object_concord_replacements.items():
                    if subtag in tags:
                        tags.remove(subtag)
                        tags.add(objtag)

            # Remove the dummy mood tag that we sometimes
            # use to block adding other mood and related
            # tags
            tags = tags - set(
                [
                    "dummy-mood",
                    "dummy-tense",
                    "dummy-ignore-skipped",
                    "dummy-object-concord",
                    "dummy-reset-headers",
                    "dummy-use-as-coltags",
                    "dummy-use-as-rowtags",
                    "dummy-store-hdrspan",
                    "dummy-load-stored-hdrspans",
                    "dummy-reset-stored-hdrspans",
                    "dummy-section-header",
                ]
            )

            # Perform language-specific tag replacements according
            # to rules in a table.
            lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
            if lang_tag_mappings is not None:
                for pre, post in lang_tag_mappings.items():
                    if all(t in tags for t in pre):
                        tags = (tags - set(pre)) | set(post)

            # Warn if there are entries with empty tags
            if not tags:
                wxr.wtp.debug(
                    "inflection table: empty tags for {}".format(form),
                    sortid="inflection/1826",
                )

            # Warn if form looks like IPA
            ########## XXX ########
            # Because IPA is its own unicode block, we could also
            # technically do a Unicode name check to see if a string
            # contains IPA. Not all valid IPA characters are in the
            # IPA extension block, so you can technically have false
            # negatives if it's something like /toki/, but it
            # shouldn't give false positives.
            # Alternatively, you could make a list of IPA-admissible
            # characters and reject non-IPA stuff with that.
            if re.match(r"\s*/.*/\s*$", form):
                wxr.wtp.debug(
                    "inflection table form looks like IPA: "
                    "form={} tags={}".format(form, tags),
                    sortid="inflection/1840",
                )

            # Note that this checks `form`, not `in tags`
            if form == "dummy-ignored-text-cell":
                continue

            if "dummy-remove-this-cell" in tags:
                continue

            # Add the form
            tags = list(sorted(tags))
            dt = {"form": form, "tags": tags, "source": source}
            if roman:
                dt["roman"] = roman
            if ipa:
                dt["ipa"] = ipa
            ret.append(dt)
            # If we got separate clitic form, add it
            if clitic:
                dt = {
                    "form": clitic,
                    "tags": tags + ["clitic"],
                    "source": source,
                }
                ret.append(dt)
    return ret, form, some_has_covered_text

2165 

2166 # First extract definitions from cells 

2167 # See defs_ht for footnote defs stuff 

2168 for row in rows: 

2169 for cell in row: 

2170 text, refs, defs, hdr_tags = extract_cell_content( 

2171 lang, word, cell.text 

2172 ) 

2173 # refs, defs = footnote stuff, defs -> (ref, def) 

2174 add_defs(defs) 

2175 # Extract definitions from text after table 

2176 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after) 

2177 add_defs(defs) 

2178 

2179 # Then extract the actual forms 

2180 ret = [] 

2181 hdrspans = [] 

2182 first_col_has_text = False 

2183 rownum = 0 

2184 title = None 

2185 global_tags = [] 

2186 table_tags = [] 

2187 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits") 

2188 form_replacements = get_lang_conf(lang, "form_replacements") 

2189 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells") 

2190 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups") 

2191 

2192 for title in titles: 

2193 more_global_tags, more_table_tags, extra_forms = parse_title( 

2194 title, source 

2195 ) 

2196 global_tags.extend(more_global_tags) 

2197 table_tags.extend(more_table_tags) 

2198 ret.extend(extra_forms) 

2199 cell_rowcnt = collections.defaultdict(int) 

2200 seen_cells = set() 

2201 has_covering_hdr = set() 

2202 some_has_covered_text = False 

2203 for row in rows: 

2204 # print("ROW:", row) 

2205 # print("====") 

2206 # print(f"Start of PREVIOUS row hdrspans:" 

2207 # f"{tuple(sp.tagsets for sp in hdrspans)}") 

2208 # print(f"Start of row txt: {tuple(t.text for t in row)}") 

2209 if not row: 2209 ↛ 2210line 2209 didn't jump to line 2210 because the condition on line 2209 was never true

2210 continue # Skip empty rows 

2211 all_headers = all(x.is_title or not x.text.strip() for x in row) 

2212 text = row[0].text 

2213 if ( 

2214 row[0].is_title 

2215 and text 

2216 and not is_superscript(text[0]) 

2217 and text not in infl_map # zealous inflation map? 

2218 and ( 

2219 re.match(r"Inflection ", text) 

2220 or re.sub( 

2221 r"\s+", 

2222 " ", # flatten whitespace 

2223 re.sub( 

2224 r"\s*\([^)]*\)", 

2225 "", 

2226 # Remove whitespace+parens 

2227 text, 

2228 ), 

2229 ).strip() 

2230 not in infl_map 

2231 ) 

2232 and not re.match(infl_start_re, text) 

2233 and all( 

2234 x.is_title == row[0].is_title and x.text == text 

2235 # all InflCells in `row` have the same is_title and text 

2236 for x in row 

2237 ) 

2238 ): 

2239 if text and title is None: 

2240 # Only if there were no titles previously make the first 

2241 # text that is found the title 

2242 title = text 

2243 if re.match(r"(Note:|Notes:)", title): 2243 ↛ 2244line 2243 didn't jump to line 2244 because the condition on line 2243 was never true

2244 continue # not a title 

2245 more_global_tags, more_table_tags, extra_forms = parse_title( 

2246 title, source 

2247 ) 

2248 global_tags.extend(more_global_tags) 

2249 table_tags.extend(more_table_tags) 

2250 ret.extend(extra_forms) 

2251 continue # Skip title rows without incrementing i 

2252 if "dummy-skip-this" in global_tags: 2252 ↛ 2253line 2252 didn't jump to line 2253 because the condition on line 2252 was never true

2253 return [] 

2254 rowtags = [()] 

2255 # have_hdr = False 

2256 # have_hdr never used? 

2257 have_text = False 

2258 samecell_cnt = 0 

2259 col0_hdrspan = None # col0 or later header (despite its name) 

2260 col0_followed_by_nonempty = False 

2261 row_empty = True 

2262 for col_idx, cell in enumerate(row): 

2263 colspan = cell.colspan # >= 1 

2264 rowspan = cell.rowspan # >= 1 

2265 previously_seen = id(cell) in seen_cells 

2266 # checks to see if this cell was in the previous ROW 

2267 seen_cells.add(id(cell)) 

2268 if samecell_cnt == 0: 

2269 # First column of a (possible multi-column) cell 

2270 samecell_cnt = colspan - 1 

2271 else: 

2272 assert samecell_cnt > 0 

2273 samecell_cnt -= 1 

2274 continue 

2275 

2276 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0 

2277 # never used? 

2278 

2279 # defaultdict(int) around line 1900 

2280 cell_rowcnt[id(cell)] += 1 

2281 # => how many cols this spans 

2282 col = cell.text 

2283 if not col: 

2284 continue 

2285 row_empty = False 

2286 is_title = cell.is_title 

2287 

2288 # If the cell has a target, i.e., text after colon, interpret 

2289 # it as simply specifying a value for that value and ignore 

2290 # it otherwise. 

2291 if cell.target: 

2292 text, refs, defs, hdr_tags = extract_cell_content( 

2293 lang, word, col 

2294 ) 

2295 if not text: 2295 ↛ 2296line 2295 didn't jump to line 2296 because the condition on line 2295 was never true

2296 continue 

2297 refs_tags = set() 

2298 for ref in refs: # gets tags from footnotes 2298 ↛ 2299line 2298 didn't jump to line 2299 because the loop on line 2298 never started

2299 if ref in def_ht: 

2300 refs_tags.update(def_ht[ref]) 

2301 rowtags = expand_header( 

2302 wxr, 

2303 tablecontext, 

2304 word, 

2305 lang, 

2306 pos, 

2307 text, 

2308 [], 

2309 silent=True, 

2310 depth=depth, 

2311 ) 

2312 rowtags = list( 

2313 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags) 

2314 ) 

2315 is_title = False 

2316 col = cell.target 

2317 

2318 # print(rownum, col_idx, col) 

2319 # print(f"is_title: {is_title}") 

2320 if is_title: 

2321 # It is a header cell 

2322 text, refs, defs, hdr_tags = extract_cell_content( 

2323 lang, word, col 

2324 ) 

2325 if not text: 

2326 continue 

2327 # Extract tags from referenced footnotes 

2328 refs_tags = set() 

2329 for ref in refs: 

2330 if ref in def_ht: 

2331 refs_tags.update(def_ht[ref]) 

2332 

2333 # Expand header to tags 

2334 v = expand_header( 

2335 wxr, 

2336 tablecontext, 

2337 word, 

2338 lang, 

2339 pos, 

2340 text, 

2341 [], 

2342 silent=True, 

2343 depth=depth, 

2344 ) 

2345 # print("EXPANDED {!r} to {}".format(text, v)) 

2346 

2347 if col_idx == 0: 

2348 # first_col_has_text is used for a test to ignore 

2349 # upper-left cells that are just text without 

2350 # header info 

2351 first_col_has_text = True 

2352 # Check if the header expands to reset hdrspans 

2353 if any("dummy-reset-headers" in tt for tt in v): 

2354 new_hdrspans = [] 

2355 for hdrspan in hdrspans: 

2356 # if there are HdrSpan objects (abstract headers with 

2357 # row- and column-spans) that are to the left or at the 

2358 # same row or below, KEEP those; things above and to 

2359 # the right of the hdrspan with dummy-reset-headers 

2360 # are discarded. Tags from the header together with 

2361 # dummy-reset-headers are kept as normal. 

2362 if ( 

2363 hdrspan.start + hdrspan.colspan < col_idx 

2364 or hdrspan.rownum > rownum - cell.rowspan 

2365 ): 

2366 new_hdrspans.append(hdrspan) 

2367 hdrspans = new_hdrspans 

2368 

2369 for tt in v: 

2370 if "dummy-section-header" in tt: 2370 ↛ 2371line 2370 didn't jump to line 2371 because the condition on line 2370 was never true

2371 tablecontext.section_header = tt 

2372 break 

2373 if "dummy-reset-section-header" in tt: 2373 ↛ 2374line 2373 didn't jump to line 2374 because the condition on line 2373 was never true

2374 tablecontext.section_header = [] 

2375 # Text between headers on a row causes earlier headers to 

2376 # be reset 

2377 if have_text: 

2378 # print(" HAVE_TEXT BEFORE HDR:", col) 

2379 # Reset rowtags if new title column after previous 

2380 # text cells 

2381 # +-----+-----+-----+-----+ 

2382 # |hdr-a|txt-a|hdr-B|txt-B| 

2383 # +-----+-----+-----+-----+ 

2384 # ^reset rowtags=> 

2385 # XXX beware of header "—": "" - must not clear on that if 

2386 # it expands to no tags 

2387 rowtags = [()] 

2388 # have_hdr = True 

2389 # have_hdr never used? 

2390 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags)) 

2391 # Update rowtags and coltags 

2392 has_covering_hdr.add(col_idx) # col_idx == current column 

2393 # has_covering_hdr is a set that has the col_idx-ids of columns 

2394 # that have previously had some kind of header. It is never 

2395 # resetted inside the col_idx-loops OR the bigger rows-loop, so 

2396 # applies to the whole table. 

2397 

2398 rowtags, new_coltags, all_hdr_tags = generate_tags( 

2399 rowtags, table_tags 

2400 ) 

2401 

2402 if any("dummy-skip-this" in ts for ts in rowtags): 

2403 continue # Skip this cell 

2404 

2405 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2405 ↛ 2406line 2405 didn't jump to line 2406 because the condition on line 2405 was never true

2406 hdrspans.extend(tablecontext.stored_hdrspans) 

2407 

2408 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2408 ↛ 2409line 2408 didn't jump to line 2409 because the condition on line 2408 was never true

2409 tablecontext.stored_hdrspans = [] 

2410 

2411 if any("dummy-store-hdrspan" in ts for ts in v): 2411 ↛ 2413line 2411 didn't jump to line 2413 because the condition on line 2411 was never true

2412 # print(f"STORED: {col}") 

2413 store_new_hdrspan = True 

2414 else: 

2415 store_new_hdrspan = False 

2416 

2417 new_coltags = list( 

2418 x 

2419 for x in new_coltags 

2420 if not any(t in noinherit_tags for t in x) 

2421 ) 

2422 # print("new_coltags={} previously_seen={} all_hdr_tags={}" 

2423 # .format(new_coltags, previously_seen, all_hdr_tags)) 

2424 if any(new_coltags): 

2425 ( 

2426 col, 

2427 col0_followed_by_nonempty, 

2428 col0_hdrspan, 

2429 ) = add_new_hdrspan( 

2430 col, 

2431 hdrspans, 

2432 store_new_hdrspan, 

2433 col0_followed_by_nonempty, 

2434 col0_hdrspan, 

2435 ) 

2436 

2437 continue 

2438 

2439 # These values are ignored, at least for now 

2440 if re.match(r"^(# |\(see )", col): 2440 ↛ 2441line 2440 didn't jump to line 2441 because the condition on line 2440 was never true

2441 continue 

2442 

2443 if any("dummy-skip-this" in ts for ts in rowtags): 

2444 continue # Skip this cell 

2445 

2446 # If the word has no rowtags and is a multi-row cell, then 

2447 # ignore this. This happens with empty separator rows 

2448 # within a rowspan>1 cell. cf. wander/English/Conjugation. 

2449 if rowtags == [()] and rowspan > 1: 

2450 continue 

2451 

2452 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle. 

2453 if cleanup_rules: 

2454 for regx, substitution in cleanup_rules.items(): 

2455 col = re.sub(regx, substitution, col) 

2456 

2457 if ( 2457 ↛ 2462line 2457 didn't jump to line 2462

2458 col_idx == 0 

2459 and not first_col_has_text 

2460 and get_lang_conf(lang, "ignore_top_left_text_cell") is True 

2461 ): 

2462 continue # Skip text at top left, as in Icelandic, Faroese 

2463 

2464 # if col0_hdrspan is not None: 

2465 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}" 

2466 # .format(col0_hdrspan.text, col)) 

2467 col0_followed_by_nonempty = True 

2468 have_text = True 

2469 

2470 # Determine column tags for the multi-column cell 

2471 combined_coltags = compute_coltags( 

2472 lang, pos, hdrspans, col_idx, colspan, col 

2473 ) 

2474 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2474 ↛ 2475line 2474 didn't jump to line 2475 because the condition on line 2474 was never true

2475 continue 

2476 

2477 # print("HAVE_TEXT:", repr(col)) 

2478 # Split the text into separate forms. First simplify spaces except 

2479 # newline. 

2480 col = re.sub(r"[ \t\r]+", " ", col) 

2481 # Split the cell text into alternatives 

2482 

2483 col, alts, split_extra_tags = split_text_into_alts(col) 

2484 

2485 # Some cells have mixed form content, like text and romanization, 

2486 # or text and IPA. Handle these. 

2487 alts = handle_mixed_lines(alts) 

2488 

2489 alts = list((x, combined_coltags) for x in alts) 

2490 

2491 # Generate forms from the alternatives 

2492 # alts is a list of (tuple of forms, tuple of tags) 

2493 for (form, base_roman, ipa), coltags in alts: 

2494 form = form.strip() 

2495 extra_tags = [] 

2496 extra_tags.extend(split_extra_tags) 

2497 # Handle special splits again here, so that we can have custom 

2498 # mappings from form to form and tags. 

2499 if form in form_replacements: 

2500 replacement, tags = form_replacements[form] 

2501 for x in tags.split(): 

2502 assert x in valid_tags 

2503 assert isinstance(replacement, str) 

2504 assert isinstance(tags, str) 

2505 form = replacement 

2506 extra_tags.extend(tags.split()) 

2507 # Clean the value, extracting reference symbols 

2508 form, refs, defs, hdr_tags = extract_cell_content( 

2509 lang, word, form 

2510 ) 

2511 # if refs: 

2512 # print("REFS:", refs) 

2513 extra_tags.extend(hdr_tags) 

2514 # Extract tags from referenced footnotes 

2515 # Extract tags from referenced footnotes 

2516 refs_tags = set() 

2517 for ref in refs: 

2518 if ref in def_ht: 

2519 refs_tags.update(def_ht[ref]) 

2520 

2521 if base_roman: 

2522 base_roman, _, _, hdr_tags = extract_cell_content( 

2523 lang, word, base_roman 

2524 ) 

2525 extra_tags.extend(hdr_tags) 

2526 

2527 # Do some additional cleanup on the cell. 

2528 form = re.sub(r"^\s*,\s*", "", form) 

2529 form = re.sub(r"\s*,\s*$", "", form) 

2530 form = re.sub(r"\s*(,\s*)+", ", ", form) 

2531 form = re.sub(r"(?i)^Main:", "", form) 

2532 form = re.sub(r"\s+", " ", form) 

2533 form = form.strip() 

2534 

2535 # Look for parentheses that have semantic meaning 

2536 form, et = find_semantic_parens(form) 

2537 extra_tags.extend(et) 

2538 

2539 # Handle parentheses in the table element. We parse 

2540 # tags anywhere and romanizations anywhere but beginning. 

2541 roman = base_roman 

2542 paren = None 

2543 clitic = None 

2544 m = re.search(r"(\s+|^)\(([^)]*)\)", form) 

2545 # start|spaces + (anything) 

2546 if m is not None: 

2547 subst = m.group(1) 

2548 paren = m.group(2) 

2549 else: 

2550 m = re.search(r"\(([^)]*)\)(\s+|$)", form) 

2551 # (anything) + spaces|end 

2552 if m is not None: 2552 ↛ 2553line 2552 didn't jump to line 2553 because the condition on line 2552 was never true

2553 paren = m.group(1) 

2554 subst = m.group(2) 

2555 if paren is not None: 

2556 form, roman, clitic = handle_parens( 

2557 form, roman, clitic, extra_tags 

2558 ) 

2559 

2560 # Ignore certain forms that are not really forms, 

2561 # unless they're really, really close to the article title 

2562 if form in ( 2562 ↛ 2567line 2562 didn't jump to line 2567 because the condition on line 2562 was never true

2563 "", 

2564 "unchanged", 

2565 "after an", # in sona/Irish/Adj/Mutation 

2566 ): 

2567 Lev = distw([form], word) 

2568 if form and Lev < 0.1: 

2569 wxr.wtp.debug( 

2570 "accepted possible false positive '{}' with" 

2571 "> 0.1 Levenshtein distance in {}/{}".format( 

2572 form, word, lang 

2573 ), 

2574 sortid="inflection/2213", 

2575 ) 

2576 elif form and Lev < 0.3: 

2577 wxr.wtp.debug( 

2578 "skipped possible match '{}' with > 0.3" 

2579 "Levenshtein distance in {}/{}".format( 

2580 form, word, lang 

2581 ), 

2582 sortid="inflection/2218", 

2583 ) 

2584 continue 

2585 else: 

2586 continue 

2587 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} " 

2588 # "FORM={!r} ROMAN={!r}" 

2589 # .format(rowtags, coltags, refs_tags, 

2590 # form, roman)) 

2591 

2592 # Merge tags from row and column and do miscellaneous 

2593 # tag-related handling. 

2594 ( 

2595 merge_ret, 

2596 form, 

2597 some_has_covered_text, 

2598 ) = merge_row_and_column_tags(form, some_has_covered_text) 

2599 ret.extend(merge_ret) 

2600 

2601 # End of row. 

2602 rownum += 1 

2603 # For certain languages, if the row was empty, reset 

2604 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb). 

2605 if row_empty and get_lang_conf(lang, "empty_row_resets"): 

2606 hdrspans = [] 

2607 # Check if we should expand col0_hdrspan. 

2608 if col0_hdrspan is not None: 

2609 col0_allowed = get_lang_conf(lang, "hdr_expand_first") 

2610 col0_cats = tagset_cats(col0_hdrspan.tagsets) 

2611 # Only expand if col0_cats and later_cats are allowed 

2612 # and don't overlap and col0 has tags, and there have 

2613 # been no disallowed cells in between. 

2614 if ( 

2615 not col0_followed_by_nonempty 

2616 and not (col0_cats - col0_allowed) 

2617 and 

2618 # len(col0_cats) == 1 and 

2619 col_idx > col0_hdrspan.start + col0_hdrspan.colspan 

2620 ): 

2621 # If an earlier header is only followed by headers that yield 

2622 # no tags, expand it to entire row 

2623 # print("EXPANDING COL0: {} from {} to {} cols {}" 

2624 # .format(col0_hdrspan.text, col0_hdrspan.colspan, 

2625 # len(row) - col0_hdrspan.start, 

2626 # col0_hdrspan.tagsets)) 

2627 col0_hdrspan.colspan = len(row) - col0_hdrspan.start 

2628 col0_hdrspan.expanded = True 

2629 # XXX handle refs and defs 

2630 # for x in hdrspans: 

2631 # print(" HDRSPAN {} {} {} {!r}" 

2632 # .format(x.start, x.colspan, x.tagsets, x.text)) 

2633 

2634 # Post-process German nouns with articles in separate columns. We move the 

2635 # definite/indefinite/usually-without-article markers into the noun and 

2636 # remove the article entries. 

2637 if get_lang_conf(lang, "articles_in_separate_columns") and any( 

2638 "noun" in x["tags"] for x in ret 

2639 ): 

2640 new_ret = [] 

2641 saved_tags = set() 

2642 had_noun = False 

2643 for dt in ret: 

2644 tags = dt["tags"] 

2645 # print(tags) 

2646 if "noun" in tags: 

2647 tags = list( 

2648 sorted(set(t for t in tags if t != "noun") | saved_tags) 

2649 ) 

2650 had_noun = True 

2651 elif ( 2651 ↛ 2678line 2651 didn't jump to line 2678

2652 "indefinite" in tags 

2653 or "definite" in tags 

2654 or "usually-without-article" in tags 

2655 or "without-article" in tags 

2656 ): 

2657 if had_noun: 

2658 saved_tags = set(tags) 

2659 else: 

2660 saved_tags = saved_tags | set(tags) # E.g. Haus/German 

2661 remove_useless_tags(lang, pos, saved_tags) 

2662 saved_tags = saved_tags & set( 

2663 [ 

2664 "masculine", 

2665 "feminine", 

2666 "neuter", 

2667 "singular", 

2668 "plural", 

2669 "indefinite", 

2670 "definite", 

2671 "usually-without-article", 

2672 "without-article", 

2673 ] 

2674 ) 

2675 had_noun = False 

2676 continue # Skip the articles 

2677 

2678 dt = dt.copy() 

2679 dt["tags"] = tags 

2680 new_ret.append(dt) 

2681 ret = new_ret 

2682 

2683 elif possibly_ignored_forms: 

2684 # Some languages have tables with cells that are kind of separated 

2685 # and difficult to handle, like eulersche Formel/German where 

2686 # the definite and indefinite articles are just floating. 

2687 # If a language has a dict of conditionally_ignored_cells, 

2688 # and if the contents of a cell is found in one of the rules 

2689 # there, ignore that cell if it 

2690 # 1. Does not have the appropriate tag (like "definite" for "die") 

2691 # and 

2692 # 2. The title of the article is not one of the other co-words 

2693 # (ie. it's an article for the definite articles in german etc.) 

2694 # pass 

2695 new_ret = [] 

2696 for cell_data in ret: 

2697 tags = cell_data["tags"] 

2698 text = cell_data["form"] 

2699 skip_this = False 

2700 for key_tag, ignored_forms in possibly_ignored_forms.items(): 

2701 if text not in ignored_forms: 2701 ↛ 2703line 2701 didn't jump to line 2703 because the condition on line 2701 was always true

2702 continue 

2703 if word in ignored_forms: 

2704 continue 

2705 if key_tag not in tags: 

2706 skip_this = True 

2707 

2708 if skip_this: 2708 ↛ 2709line 2708 didn't jump to line 2709 because the condition on line 2708 was never true

2709 continue 

2710 new_ret.append(cell_data) 

2711 

2712 ret = new_ret 

2713 

2714 # Post-process English inflection tables, addding "multiword-construction" 

2715 # when the number of words has increased. 

2716 if lang == "English" and pos == "verb": 

2717 word_words = len(word.split()) 

2718 new_ret = [] 

2719 for dt in ret: 

2720 form = dt.get("form", "") 

2721 if len(form.split()) > word_words: 

2722 dt = dt.copy() 

2723 dt["tags"] = list(dt.get("tags", [])) 

2724 # This strange copy-assigning shuffle is preventative black 

2725 # magic; do not touch lest you invoke deep bugs. 

2726 data_append(dt, "tags", "multiword-construction") 

2727 new_ret.append(dt) 

2728 ret = new_ret 

2729 

2730 # Always insert "table-tags" detail as the first entry in any inflection 

2731 # table. This way we can reliably detect where a new table starts. 

2732 # Table-tags applies until the next table-tags entry. 

2733 if ret or table_tags: 

2734 table_tags = list(sorted(set(table_tags))) 

2735 dt = { 

2736 "form": " ".join(table_tags), 

2737 "source": source, 

2738 "tags": ["table-tags"], 

2739 } 

2740 if dt["form"] == "": 

2741 dt["form"] = "no-table-tags" 

2742 if tablecontext.template_name: 

2743 tn = { 

2744 "form": tablecontext.template_name, 

2745 "source": source, 

2746 "tags": ["inflection-template"], 

2747 } 

2748 ret = [dt] + [tn] + ret 

2749 else: 

2750 ret = [dt] + ret 

2751 

2752 return ret 

2753 

2754 

def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Parse one inflection table (already split into rows of InflCell
    objects) and append the extracted word forms to ``data["forms"]``.

    Arguments:
      wxr -- WiktextractContext, used for debug output
      tablecontext -- TableContext shared between a table and its subtables
      data -- part-of-speech data dict that receives the "forms" entries
      word, lang, pos -- page title, language name, and part-of-speech
      rows -- list of table rows, each a list of InflCell
      titles -- title strings collected around the table
      source -- name of the section the table came from
      after -- cleaned text that followed the table
      depth -- nesting depth of the table (0 for a top-level table)
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        assert isinstance(row, list)
        for x in row:
            assert isinstance(x, InflCell)
    assert isinstance(titles, list)
    for x in titles:
        assert isinstance(x, str)

    # Try to parse the table as a simple table
    ret = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if ret is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms but eliminate duplicates.
    have_forms = set()
    for dt in ret:
        # freeze() turns the (nested) dict into a hashable value so it can
        # be used for duplicate detection in the have_forms set.
        fdt = freeze(dt)
        if fdt in have_forms:
            continue  # Don't add duplicates
        # Some Russian words have Declension and Pre-reform declension partially
        # duplicating same data. Don't add "dated" tags variant if already have
        # the same without "dated" from the modern declension table

        tags = dt.get("tags", [])
        for dated_tag in ("dated",):
            if dated_tag in tags:
                dt2 = dt.copy()
                tags2 = list(x for x in tags if x != dated_tag)
                dt2["tags"] = tags2
                if tags2 and freeze(dt2) in have_forms:
                    break  # Already have without archaic
        # NOTE: for/else — the else branch runs only when the loop above did
        # NOT break, i.e. when no "dated"-less duplicate was already present.
        else:
            if "table-tags" not in tags:
                have_forms.add(fdt)
            data_append(data, "forms", dt)

2810 

2811 

def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Heuristically decide whether a table cell should be treated as a
    header ("title") cell.

    Returns a tuple ``(is_title, hdr_expansion, target, celltext)``:
      is_title -- True if the cell should be handled as a header
      hdr_expansion -- tag sets produced by expand_header() for the cell
      target -- text after a "<header>: " prefix (if any), else the passed-in
                value
      celltext -- possibly truncated cell text (prefix before ": ")
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # header_kind is the node kind/tag that marks an explicit header cell
    # in this table representation (wikitext vs. HTML).
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # print("col: {}".format(col))
        # Heuristic headers (data cells that look like headers) are only
        # accepted for languages/texts whitelisted in
        # LANGUAGES_WITH_CELLS_AS_HEADERS; otherwise emit debug messages.
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    if idx >= 0 and titletext[:idx] in infl_map:
        # "<known header>: <target>" style cell; split it.
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext

3004 

3005 

class TableContext:
    """Saved context used when parsing a table and its subtables.

    Attributes:
      stored_hdrspans -- header spans carried over between subtables
      section_header -- header data for the current table section
      template_name -- name of the template the table came from, or ""
    """

    # Fix: the original declared ``__slot__`` (missing the trailing "s"),
    # which is just an inert class attribute and does not enable slots.
    # ``__slots__`` is the special name that suppresses the per-instance
    # __dict__; only the three attributes below are assigned in this module.
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name=None):
        self.stored_hdrspans = []
        self.section_header = []
        # Normalize falsy template names (None, "") to the empty string.
        if not template_name:
            self.template_name = ""
        else:
            self.template_name = template_name

3022 

3023 

def handle_wikitext_or_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms.  This adds the forms into ``data``.

    ``tree`` must be either a wikitext TABLE node or an HTML <table> node;
    nested tables are flattened into a sequence of
    (rows, titles, after, depth) chunks by the handle_table1() helper and
    each chunk is fed to handle_generic_table() in order."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    if not tablecontext:
        tablecontext = TableContext()

    def handle_table1(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        depth,
    ):
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order.

        Returns a list of (rows, titles, after, depth) tuples."""

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        col_gap_data = []  # Filling for columns with rowspan > 1
        # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
        # the column; vertical_still_left contains int
        cols_headered = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows = []

        sub_ret = []

        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            # For HTML nodes the "kind" is the tag name string ("tr", "td",
            # ...); for wikitext nodes it is the NodeKind enum value.
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are intially shown in tables
                    # that have more data.  The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # Parse a table row.
                row = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in node.children:
                    # loop through each cell in the ROW
                    if not isinstance(col, WikiNode):
                        # This skip is not used for counting,
                        # "None" is not used in
                        # indexing or counting or looping.
                        continue
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print("    UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column. It is populated at the end of the loop, at the
                        # same time as col_gap_data. This needs to be looped and
                        # filled this way because each `for col`-looping jumps
                        # straight to the next meaningful cell; there is no
                        # "None" cells, only emptiness between, and rowspan and
                        # colspan are just to generate the "fill-
                        vertical_still_left[len(row)] -= 1
                        row.append(col_gap_data[len(row)])

                        # appending row is how "indexing" is
                        # done here; something is appended,
                        # like a filler-cell here or a "start"
                        # cell at the end of the row-loop,
                        # which increased len(row) which is
                        # then used as the target-index to check
                        # for gaps. vertical_still_left is
                        # the countdown to when to stop
                        # filling in gaps, and goes down to 0,
                        # and col_gap_data is not touched
                        # except when a new rowspan is needed,
                        # at the same time that
                        # vertical_still_left gets reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        # Malformed rowspan/colspan attribute: treat the cell
                        # as a plain 1x1 cell.
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: isinstance(x, WikiNode)
                        and (x.kind == NodeKind.TABLE or x.sarg == "table"),
                    )

                    # Clean the rest of the cell.
                    celltext = clean_node(wxr, None, rest)
                    # print("CLEANED:", celltext)

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a descrip-
                        # tive text that should be treated as a title (e.g.,
                        # "Forms with the definite article", with "definite" not
                        # mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Flush the rows collected so far as one chunk and
                            # splice in the subtable's chunks after them.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #     row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/ed if
                                # it is lacking slots
                                # for each column; col_gap_data and
                                # vertical_still_left are never
                                # reset to [], during
                                # the whole table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that
                            # will be used in the
                            # future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to be
                            # filled (row.append or
                            # row[col_gap_data[len(row)] =>
                            # rows), it is not reset to [], but decremented to 0
                            # each time a row gets something from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects.  Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )

3354 

3355 

def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Thin forwarding wrapper for HTML tables; all real work happens in
    handle_wikitext_or_html_table().  XXX consider removing this shim."""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )

3363 

3364 

def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Thin forwarding wrapper for wikitext tables; all real work happens
    in handle_wikitext_or_html_table().  XXX consider removing this shim."""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )

3372 

3373 

3374def parse_inflection_section( 

3375 wxr, data, word, lang, pos, section, tree, tablecontext=None 

3376): 

3377 """Parses an inflection section on a page. ``data`` should be the 

3378 data for a part-of-speech, and inflections will be added to it.""" 

3379 

3380 # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}" 

3381 # .format(word, lang, pos, section)) 

3382 assert isinstance(wxr, WiktextractContext) 

3383 assert isinstance(data, dict) 

3384 assert isinstance(word, str) 

3385 assert isinstance(lang, str) 

3386 assert isinstance(section, str) 

3387 assert isinstance(tree, WikiNode) 

3388 assert tablecontext is None or isinstance(tablecontext, TableContext) 

3389 source = section 

3390 tables = [] 

3391 titleparts = [] 

3392 

3393 def process_tables(): 

3394 for kind, node, titles, after in tables: 

3395 after = "".join(after).strip() 

3396 after = clean_value(wxr, after) 

3397 if kind == "wikitext": 

3398 handle_wikitext_table( 

3399 wxr, 

3400 word, 

3401 lang, 

3402 pos, 

3403 data, 

3404 node, 

3405 titles, 

3406 source, 

3407 after, 

3408 tablecontext=tablecontext, 

3409 ) 

3410 elif kind == "html": 3410 ↛ 3424line 3410 didn't jump to line 3424 because the condition on line 3410 was always true

3411 handle_html_table( 

3412 wxr, 

3413 word, 

3414 lang, 

3415 pos, 

3416 data, 

3417 node, 

3418 titles, 

3419 source, 

3420 after, 

3421 tablecontext=tablecontext, 

3422 ) 

3423 else: 

3424 raise RuntimeError( 

3425 "{}: unimplemented table kind {}".format(word, kind) 

3426 ) 

3427 

3428 def recurse_navframe(node, titles): 

3429 nonlocal tables 

3430 nonlocal titleparts 

3431 titleparts = [] 

3432 old_tables = tables 

3433 tables = [] 

3434 

3435 recurse(node, [], navframe=True) 

3436 

3437 process_tables() 

3438 tables = old_tables 

3439 

    def recurse(node, titles, navframe=False):
        """Walk the parse tree, collecting inflection tables into ``tables``.

        ``titles`` accumulates heading/title texts that apply to tables
        found below this node.  ``navframe`` is True while we are inside a
        NavFrame div, where loose strings are gathered as title parts
        instead of table trailing text.
        """
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            if tables:
                # Text after a table goes into that table's "after" list.
                tables[-1][-1].append(node)
            elif navframe:
                # Inside a NavFrame, loose text contributes to the title.
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    # Show/hide toggle widget -- no content of interest.
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    # NavHead children provide the title text.
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    # Finalize the accumulated title and descend into the
                    # content with navframe mode switched off.
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    # "Note:"/"Notes:" headers are commentary, not titles.
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                classes = node.attrs.get("class", ())
                if "audiotable" in classes:
                    # Pronunciation audio tables are not inflection tables.
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
        if (
            kind == NodeKind.HTML
            and node.sarg == "div"
            and "NavFrame" in node.attrs.get("class", "").split()
        ):
            # Collapsible NavFrame: handled as its own group of tables.
            recurse_navframe(node, titles)
            return
        if kind == NodeKind.LINK:
            # For piped links, recurse into the display text; otherwise
            # into the link target.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        for x in node.children:
            recurse(x, titles, navframe)

    # The tree passed in must be a full parse tree rooted at ROOT.
    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        # Mutation tables are excluded from the test-table dump.
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")