Coverage for src / wiktextract / extractor / en / inflection.py: 87%

1542 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-11 04:48 +0000

1# Code for parsing inflection tables. 

2# 

3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org. 

4 

5import collections 

6import copy 

7import functools 

8import html 

9import re 

10import unicodedata 

11from typing import TYPE_CHECKING, Generator, Literal, Optional, Union 

12 

13from mediawiki_langcodes import code_to_name, name_to_code 

14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, freeze, split_at_comma_semi 

18from ...tags import valid_tags 

19from ...wxr_context import WiktextractContext 

20from .form_descriptions import ( 

21 classify_desc, 

22 decode_tags, 

23 distw, 

24 match_links_to_form, 

25 parse_head_final_tags, 

26) 

27from .inflection_kludges import ka_decl_noun_template_cell 

28from .inflectiondata import infl_map, infl_start_map, infl_start_re 

29from .lang_specific_configs import get_lang_conf, lang_specific_tags 

30from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS 

31from .type_utils import FormData, WordData 

32 

# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
# None means the debug hook is disabled; set via set_debug_cell_text().
debug_cell_text: Optional[str] = None

37 

38 

def set_debug_cell_text(text: str) -> None:
    """Set the cell text that triggers debug output during table parsing."""
    global debug_cell_text
    debug_cell_text = text

42 

43 

# Type alias: a list of tag tuples; each tuple is one alternative
# interpretation (a sorted collection of tag strings).
TagSets = list[tuple[str, ...]]

45 

# Column texts that are interpreted as an empty column.
# Mostly various Unicode hyphen/dash characters, plus a few explicit
# "no such form" phrases used in Wiktionary tables.
IGNORED_COLVALUES = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "-",
    "/",
    "?",
    "not used",
    "not applicable",
}

68 

# These tags are never inherited from above (i.e. from a header spanning
# earlier rows); they apply only to the row where they appear.
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

79 

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags
object_concord_replacements = {
    "first-person": "object-first-person",
    "second-person": "object-second-person",
    "third-person": "object-third-person",
    "singular": "object-singular",
    "plural": "object-plural",
    "definite": "object-definite",
    "indefinite": "object-indefinite",
    # Noun-class concord tags 1-18 all follow the same "object-" pattern.
    **{f"class-{i}": f"object-class-{i}" for i in range(1, 19)},
    "masculine": "object-masculine",
    "feminine": "object-feminine",
}

111 

# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: every mapped value must split into known tags.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# "<kind> of WORD" header fragment that must NOT be treated as a tag word;
# it is matched first in the alternation below so it can be skipped.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)

152 

# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
    "contracted": "contracted",
    "present": "present",
    "perfect": "perfect",
    "imperfect": "imperfect",
    "pluperfect": "pluperfect",
    "future": "future",
    "aorist": "aorist",
}
# Import-time sanity check: every mapped value must split into known tags.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
# Same shape as title_contains_global_re: the "<kind> of WORD" fragment is
# matched first so those spans can be skipped by the caller.
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)

234 

# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "iterative": "iterative",
    "poetic": "poetic",
    # Ancient Greek dialect labels follow.
    "Attic": "Attic",
    "Epic": "Epic",
    "Aeolic": "Aeolic",
    "Arcadocypriot": "Arcadocypriot",
    "Old Attic": "Old-Attic",
    "Boeotian": "Boeotian",
    "Byzantine": "Byzantine",
    "Choral Doric": "Choral-Doric",
    "Doric": "Doric",
    "Elean": "Elean",
    "Epirote": "Epirote",
    "Ionic": "Ionic",
    "Koine": "Koine",
    "Cretan": "Cretan",
    "Corinthian": "Corinthian",
    "Laconian": "Laconian",
    "Later poetic": "Later-poetic-Ancient-Greek",
    "Lesbian": "Lesbian",
    "Locrian": "Locrian",
    "Lyric": "Lyric-Ancient-Greek",
    "Thessalian": "Thessalian",
    "Tragic": "Tragic-Ancient-Greek",
}
# Import-time sanity check: every mapped value must split into known tags.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

276 

# Parenthesized element starts to map them to tags for form for the rest of
# the element
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: every mapped value must split into known tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Matches one of the keys above at the start of an element, followed by a
# space; group(1) is the key, the remainder becomes the "form" value.
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)

297 

298 

299# Regexp for cell starts that are likely definitions of reference symbols. 

300# See also nondef_re. 

301def_re = re.compile( 

302 r"(\s*•?\s+)?" 

303 r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|" 

304 r"\^(\*+|[△†])|" 

305 r"([¹²³⁴⁵⁶⁷⁸⁹])|" 

306 r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))" 

307) 

308# ᴺᴸᴴ persan/Old Irish 

309 

310# Regexp for cell starts that are exceptions to def_re and do not actually 

311# start a definition. 

312nondef_re = re.compile( 

313 r"(^\s*(1|2|3)\s+(sg|pl)\s*$|" # 1s or 3p etc. 

314 r"\s*\d\d?\s*/\s*\d\d?\s*$)" 

315) # taka/Swahili "15 / 17" 

316 

317 

318class InflCell: 

319 """Cell in an inflection table.""" 

320 

321 __slots__ = ( 

322 "text", 

323 "is_title", 

324 "colspan", 

325 "rowspan", 

326 "target", 

327 "links", 

328 ) 

329 

330 def __init__( 

331 self, 

332 text: str, 

333 is_title: bool, 

334 colspan: int, 

335 rowspan: int, 

336 target: str | None, 

337 cell_links: list[tuple[str, str]] | None = None, 

338 ) -> None: 

339 assert isinstance(text, str) 

340 assert is_title in (True, False) 

341 assert isinstance(colspan, int) and colspan >= 1 

342 assert isinstance(rowspan, int) and rowspan >= 1 

343 assert target is None or isinstance(target, str) 

344 self.text = text.strip() 

345 self.is_title = text and is_title 

346 self.colspan = colspan 

347 self.rowspan = rowspan 

348 self.target = target 

349 self.links = cell_links 

350 

351 def __str__(self) -> str: 

352 v = "{}/{}/{}/{!r}".format( 

353 self.text, self.is_title, self.colspan, self.rowspan 

354 ) 

355 if self.target: 

356 v += ": {!r}".format(self.target) 

357 return v 

358 

359 def __repr__(self) -> str: 

360 return str(self) 

361 

362 

class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: TagSets,
        text: str,
        all_headers_row: bool,
    ) -> None:
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        for ts in tagsets:
            assert isinstance(ts, tuple)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Normalize each tagset into a sorted, duplicate-free tuple.
        self.tagsets = [tuple(sorted(set(ts))) for ts in tagsets]
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False

403 

404 

def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        name = unicodedata.name(ch)
    except ValueError:
        # Characters without a Unicode name cannot be superscripts.
        return False
    # Superscript characters have names beginning with one of these
    # prefixes (e.g. "SUPERSCRIPT TWO", "MODIFIER LETTER SMALL A").
    return name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )

421 

422 

def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no purpose
    together (cover all options).  Mutates ``tags`` in place."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Contradictory pairs that together carry no information; removed only
    # when the language configuration enables it.
    for pair, conf_key in (
        (("animate", "inanimate"), "animate_inanimate_remove"),
        (("virile", "nonvirile"), "virile_nonvirile_remove"),
    ):
        if all(t in tags for t in pair) and get_lang_conf(lang, conf_key):
            for t in pair:
                tags.remove(t)
    # When a cell lists *every* value of a category for the language
    # (all numbers, all genders, ...), the category adds no information,
    # so remove all of its values.
    for conf_key in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        values = get_lang_conf(lang, conf_key)
        if values and all(x in tags for x in values):
            for x in values:
                tags.remove(x)

473 

474 

def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    return {valid_tags[t] for ts in tagset for t in ts}

479 

480 

def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # Bug fix: this assertion previously re-checked tagsets1 (copy-paste).
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Merge one tag tuple into the accumulated ``tagsets``, combining
        # it with an existing entry when they differ in at most one
        # category; otherwise keep it as a separate alternative.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            # print("tags1={} tags2={} num_differ={}"
            #       .format(tags1, tags2, num_differ))
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        tagsets.append(())

    # print("or_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, tagsets))
    return tagsets

543 

544 

def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # Bug fix: this assertion previously re-checked tagsets1 (copy-paste).
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            # Union of every pair of alternatives, cleaned of tags that
            # carry no information for this language/POS.
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    # print("and_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, new_tagsets))
    return new_tagsets

571 

572 

@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags), where ``cleaned`` is the cell text with
    reference markers stripped, ``refs`` is reference symbols attached to
    the cell, ``defs`` is (symbol, definition-text) pairs when the cell
    itself defines reference symbols, and ``tags`` is extra tags implied
    by special reference markers.  Cached because the same cell text
    recurs across many tables."""
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags: list[str] = []
    # Strip trailing comma/bullet and collapse whitespace.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Free-text notes (explanations, usage remarks) are ignored wholesale.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    # Peel off trailing "^x" / "^(x,y)" reference markers, one at a time.
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # The cell is itself a list of "symbol: definition" entries; split
        # it at each symbol occurrence.
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    # Strip trailing superscripts / daggers one character at a time,
    # recording them as reference symbols or special tags.
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition ("1) text", "1: text", ...)
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    #       .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags

705 

706 

@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is dictionary describing additional forms to be
    included in the part-of-speech entry).  Cached because titles repeat
    across many pages."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Strip HTML entities/tags and collapse whitespace.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags: list[str] = []
    table_tags: list[str] = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip the "<kind> of WORD" header fragment matched by the regexp.
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    # "class <x>"-style element: the remainder becomes a form.
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms

792 

793 

794def expand_header( 

795 wxr: WiktextractContext, 

796 tablecontext: "TableContext", 

797 word: str, 

798 lang: str, 

799 pos: str, 

800 text: str, 

801 base_tags: Union[list[str], set[str], tuple[str, ...]], 

802 silent=False, 

803 ignore_tags=False, 

804 depth=0, 

805 column_number: int | None = None, 

806) -> list[tuple[str, ...]]: 

807 """Expands a cell header to tagset, handling conditional expressions 

808 in infl_map. This returns list of tuples of tags, each list element 

809 describing an alternative interpretation. ``base_tags`` is combined 

810 column and row tags for the cell in which the text is being interpreted 

811 (conditional expressions in inflection data may depend on it). 

812 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags`` 

813 is True, then tags listed in "if" will be ignored in the test (this is 

814 used when trying to heuristically detect whether a non-<th> cell is anyway 

815 a header).""" 

816 assert isinstance(wxr, WiktextractContext) 

817 assert isinstance(word, str) 

818 assert isinstance(lang, str) 

819 assert isinstance(pos, str) 

820 assert isinstance(text, str) 

821 assert isinstance(base_tags, (list, tuple, set)) 

822 assert silent in (True, False) 

823 assert isinstance(depth, int) 

824 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags)) 

825 # First map the text using the inflection map 

826 text = clean_value(wxr, text) 

827 combined_return: list[tuple[str, ...]] = [] 

828 parts = split_at_comma_semi(text, separators=[";"]) 

829 for text in parts: 

830 if not text: 830 ↛ 831line 830 didn't jump to line 831 because the condition on line 830 was never true

831 continue 

832 if text in infl_map: 

833 v = infl_map[text] # list or string 

834 else: 

835 m = re.match(infl_start_re, text) 

836 if m is not None: 836 ↛ 837line 836 didn't jump to line 837 because the condition on line 836 was never true

837 v = infl_start_map[m.group(1)] 

838 # print("INFL_START {} -> {}".format(text, v)) 

839 elif re.match(r"Notes", text): 

840 # Ignored header 

841 # print("IGNORING NOTES") 

842 combined_return = or_tagsets( 

843 lang, pos, combined_return, [("dummy-skip-this",)] 

844 ) 

845 # this just adds dummy-skip-this 

846 continue 

847 elif text in IGNORED_COLVALUES: 

848 combined_return = or_tagsets( 

849 lang, pos, combined_return, [("dummy-ignore-skipped",)] 

850 ) 

851 continue 

852 # Try without final parenthesized part 

853 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text) 

854 if text_without_parens in infl_map: 

855 v = infl_map[text_without_parens] 

856 elif m is None: 856 ↛ 872line 856 didn't jump to line 872 because the condition on line 856 was always true

857 if not silent: 

858 wxr.wtp.debug( 

859 "inflection table: unrecognized header: {}".format( 

860 repr(text) 

861 ), 

862 sortid="inflection/735", 

863 ) 

864 # Unrecognized header 

865 combined_return = or_tagsets( 

866 lang, pos, combined_return, [("error-unrecognized-form",)] 

867 ) 

868 continue 

869 

870 # Then loop interpreting the value, until the value is a simple string. 

871 # This may evaluate nested conditional expressions. 

872 default_else = None 

873 while True: 

874 # If it is a string, we are done. 

875 if isinstance(v, str): 

876 tags = set(v.split()) 

877 remove_useless_tags(lang, pos, tags) 

878 tagset = [tuple(sorted(tags))] 

879 break 

880 # For a list, just interpret it as alternatives. (Currently the 

881 # alternatives must directly be strings.) 

882 if isinstance(v, (list, tuple)): 

883 tagset = [] 

884 for x in v: 

885 tags = set(x.split()) 

886 remove_useless_tags(lang, pos, tags) 

887 tags_t = tuple(sorted(tags)) 

888 if tags_t not in tagset: 888 ↛ 884line 888 didn't jump to line 884 because the condition on line 888 was always true

889 tagset.append(tags_t) 

890 break 

891 # Otherwise the value should be a dictionary describing a 

892 # conditional expression. 

893 if not isinstance(v, dict): 893 ↛ 894line 893 didn't jump to line 894 because the condition on line 893 was never true

894 wxr.wtp.debug( 

895 "inflection table: internal: " 

896 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]), 

897 sortid="inflection/767", 

898 ) 

899 tagset = [()] 

900 break 

901 # Evaluate the conditional expression. 

902 assert isinstance(v, dict) 

903 cond: Union[bool, str] = "default-true" 

904 c: Union[str, list[str], set[str]] = "" 

905 # Handle "lang" condition. The value must be either a 

906 # single language or a list of languages, and the 

907 # condition evaluates to True if the table is one of 

908 # those languages. 

909 if "lang" in v: 

910 c = v["lang"] 

911 # check if it's a code and transform if necessary 

912 if isinstance(c, str): 

913 if c != lang: 

914 cond = lang == code_to_name(c, "en") 

915 else: 

916 cond = True 

917 else: 

918 assert isinstance(c, (list, tuple, set)) 

919 if lang not in c: 

920 cond = name_to_code(lang, "en") in c 

921 else: 

922 cond = True 

923 # Handle "nested-table-depth" condition. The value must 

924 # be an int or list of ints, and the condition evaluates 

925 # True if the depth is one of those values. 

926 # "depth" is how deep into a nested table tree the current 

927 # table lies. It is first started in handle_wikitext_table, 

928 # so only applies to tables-within-tables, not other 

929 # WikiNode content. `depth` is currently only passed as a 

930 # parameter down the table parsing stack, and not stored. 

931 if cond and "nested-table-depth" in v: 931 ↛ 932line 931 didn't jump to line 932 because the condition on line 931 was never true

932 d = v["nested-table-depth"] 

933 if isinstance(d, int): 

934 cond = d == depth 

935 else: 

936 assert isinstance(d, (list, tuple, set)) 

937 cond = depth in d 

938 # Column index: check if we're in position X of the row 

939 if cond and "column-index" in v: 

940 index = v["column-index"] 

941 if isinstance(index, int): 941 ↛ 944line 941 didn't jump to line 944 because the condition on line 941 was always true

942 cond = index == column_number 

943 else: 

944 assert isinstance(index, (list, tuple, set)) 

945 cond = column_number in index 

946 # Handle inflection-template condition. Must be a string 

947 # or list of strings, and if tablecontext.template_name is in 

948 # those, accept the condition. 

949 # TableContext.template_name is passed down from page/ 

950 # parse_inflection, before parsing and expanding itself 

951 # has begun. 

952 if cond and tablecontext and "inflection-template" in v: 

953 d1 = v["inflection-template"] 

954 if isinstance(d1, str): 954 ↛ 957line 954 didn't jump to line 957 because the condition on line 954 was always true

955 cond = d1 == tablecontext.template_name 

956 else: 

957 assert isinstance(d1, (list, tuple, set)) 

958 cond = tablecontext.template_name in d1 

959 # Handle "pos" condition. The value must be either a single 

960 # part-of-speech or a list of them, and the condition evaluates to 

961 # True if the part-of-speech is any of those listed. 

962 if cond and "pos" in v: 

963 c = v["pos"] 

964 if isinstance(c, str): 

965 cond = c == pos 

966 else: 

967 assert isinstance(c, (list, tuple, set)) 

968 cond = pos in c 

969 # Handle "if" condition. The value must be a string containing a 

970 # space-separated list of tags. The condition evaluates to True if 

971 # ``base_tags`` contains all of the listed tags. If the condition 

972 # is of the form "any: ...tags...", then any of the tags will be 

973 # enough. 

974 if cond and "if" in v and not ignore_tags: 

975 c = v["if"] 

976 assert isinstance(c, str) 

977 # "if" condition is true if any of the listed tags is present if 

978 # it starts with "any:", otherwise all must be present 

979 if c.startswith("any: "): 

980 cond = any(t in base_tags for t in c[5:].split()) 

981 else: 

982 cond = all(t in base_tags for t in c.split()) 

983 

984 # Handle "default" assignment. Store the value to be used 

985 # as a default later. 

986 if "default" in v: 

987 assert isinstance(v["default"], str) 

988 default_else = v["default"] 

989 

990 # Warning message about missing conditions for debugging. 

991 

992 if cond == "default-true" and not default_else and not silent: 

993 wxr.wtp.debug( 

994 "inflection table: IF MISSING COND: word={} " 

995 "lang={} text={} base_tags={} c={} cond={}".format( 

996 word, lang, text, base_tags, c, cond 

997 ), 

998 sortid="inflection/851", 

999 ) 

1000 # Based on the result of evaluating the condition, select either 

1001 # "then" part or "else" part. 

1002 if cond: 

1003 v = v.get("then", "") 

1004 else: 

1005 v1 = v.get("else") 

1006 if v1 is None: 

1007 if default_else is not None: 

1008 v = default_else 

1009 else: 

1010 if not silent: 

1011 wxr.wtp.debug( 

1012 "inflection table: IF WITHOUT ELSE EVALS " 

1013 "False: " 

1014 "{}/{} {!r} base_tags={}".format( 

1015 word, lang, text, base_tags 

1016 ), 

1017 sortid="inflection/865", 

1018 ) 

1019 v = "error-unrecognized-form" 

1020 else: 

1021 v = v1 

1022 

1023 # Merge the resulting tagset from this header part with the other 

1024 # tagsets from the whole header 

1025 combined_return = or_tagsets(lang, pos, combined_return, tagset) 

1026 

1027 # Return the combined tagsets, or empty tagset if we got no tagsets 

1028 if not combined_return: 

1029 combined_return = [()] 

1030 return combined_return 

1031 

1032 

def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list[HdrSpan],
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Scans ``hdrspans`` bottom-up (headers closest to the data cell first),
    keeping only spans that horizontally overlap the column range
    ``[start, start + colspan)``, and combines their tagsets with
    ``or_tagsets`` (within a row) and ``and_tagsets`` (across rows).
    Category-collision rules (mood-mood, tense-tense, number-number, ...)
    decide whether a header further up is merged, skipped, or stops the
    scan entirely; several of those rules are tuned per language via
    ``get_lang_conf``.

    ``celltext`` is used only for debug printing: it is compared against
    the module-global ``debug_cell_text`` (set via --debug-text-cell).
    Returns a non-empty list of tag tuples (possibly just ``[()]``).
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    #       .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                " row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    used = set()
    coltags: list[tuple[str, ...]] = [()]
    last_header_row = 1000000
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets: list[tuple[str, ...]] = [()]
    row_tagsets_rownum = 1000000
    used_hdrspans = set()
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            # in_cats: the tag categories present in same-row headers that
            # fall entirely inside the current cell's column range.
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets. This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row. Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside

                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above. However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 10000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            # NOTE: this is a plain `if` (not elif), so it is evaluated even
            # when one of the non-stopping branches above was taken.
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags

1413 

1414 

1415def parse_simple_table( 

1416 wxr: WiktextractContext, 

1417 tablecontext: "TableContext", 

1418 word: str, 

1419 lang: str, 

1420 pos: str, 

1421 rows: list[list[InflCell]], 

1422 titles: list[str], 

1423 source: str, 

1424 after: str, 

1425 depth: int, 

1426) -> list[FormData]: 

1427 """This is the default table parser. Despite its name, it can parse 

1428 complex tables. This returns a list of forms to be added to the 

1429 part-of-speech, or None if the table could not be parsed.""" 

1430 assert isinstance(wxr, WiktextractContext) 

1431 assert isinstance(tablecontext, TableContext) 

1432 assert isinstance(word, str) 

1433 assert isinstance(lang, str) 

1434 assert isinstance(pos, str) 

1435 assert isinstance(rows, list) 

1436 assert isinstance(source, str) 

1437 assert isinstance(after, str) 

1438 assert isinstance(depth, int) 

1439 for row in rows: 

1440 for cell in row: 

1441 assert isinstance(cell, InflCell) 

1442 assert isinstance(titles, list) 

1443 for x in titles: 

1444 assert isinstance(x, str) 

1445 

1446 # print("PARSE_SIMPLE_TABLE: TITLES:", titles) 

1447 if debug_cell_text: 1447 ↛ 1448line 1447 didn't jump to line 1448 because the condition on line 1447 was never true

1448 print("ROWS:") 

1449 for row in rows: 

1450 print(" ", row) 

1451 

1452 # Check for forced rowspan kludge. See e.g. 

1453 # maorski/Serbo-Croatian. These are essentially multi-row 

1454 # cells implemented using <br> rather than separate cell. We fix this 

1455 # by identifying rows where this happens, and splitting the current row 

1456 # to multiple rows by synthesizing additional cells. 

1457 new_rows = [] 

1458 for row in rows: 

1459 split_row = ( 

1460 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row) 

1461 and 

1462 # x is an InflCell 

1463 all(x.rowspan == 1 for x in row) 

1464 ) 

1465 if not split_row: 

1466 new_rows.append(row) 

1467 continue 

1468 row1 = [] 

1469 row2 = [] 

1470 for cell in row: 

1471 cell1 = copy.deepcopy(cell) 

1472 if "\n" in cell.text: 

1473 # Has more than one line - split this cell 

1474 parts = cell.text.strip().splitlines() 

1475 if len(parts) != 2: 1475 ↛ 1476line 1475 didn't jump to line 1476 because the condition on line 1475 was never true

1476 wxr.wtp.debug( 

1477 "forced rowspan kludge got {} parts: {!r}".format( 

1478 len(parts), cell.text 

1479 ), 

1480 sortid="inflection/1234", 

1481 ) 

1482 cell2 = copy.deepcopy(cell) 

1483 cell1.text = parts[0] 

1484 cell2.text = parts[1] 

1485 else: 

1486 cell1.rowspan = 2 

1487 cell2 = cell1 # ref, not a copy 

1488 row1.append(cell1) 

1489 row2.append(cell2) 

1490 new_rows.append(row1) 

1491 new_rows.append(row2) 

1492 rows = new_rows 

1493 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:") 

1494 # for row in rows: 

1495 # print(" ", row) 

1496 

1497 # Parse definitions for references (from table itself and from text 

1498 # after it) 

1499 def_ht = {} 

1500 

1501 def add_defs(defs: list[tuple[str, str]]) -> None: 

1502 for ref, d in defs: 

1503 # print("DEF: ref={} d={}".format(ref, d)) 

1504 d = d.strip() 

1505 d = d.split(". ")[0].strip() # text before ". " 

1506 if not d: 1506 ↛ 1507line 1506 didn't jump to line 1507 because the condition on line 1506 was never true

1507 continue 

1508 if d.endswith("."): # catc ".."?? 

1509 d = d[:-1] 

1510 tags, topics = decode_tags(d, no_unknown_starts=True) 

1511 # print(f"{ref=}, {transformed=}, {tags=}") 

1512 if topics or any("error-unknown-tag" in ts for ts in tags): 

1513 d = d[0].lower() + d[1:] 

1514 tags, topics = decode_tags(d, no_unknown_starts=True) 

1515 if topics or any("error-unknown-tag" in ts for ts in tags): 

1516 # Failed to parse as tags 

1517 # print("Failed: topics={} tags={}" 

1518 # .format(topics, tags)) 

1519 continue 

1520 tags1_s: set[str] = set() 

1521 for ts in tags: 

1522 # Set.update is a union operation: definition tags are flat 

1523 tags1_s.update(ts) 

1524 tags1 = tuple(sorted(tags1_s)) 

1525 # print("DEFINED: {} -> {}".format(ref, tags1)) 

1526 def_ht[ref] = tags1 

1527 

    def generate_tags(
        rowtags: list[tuple[str, ...]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        """Combine row tags, column tags and header expansion for one cell.

        Closure over the enclosing table-parsing state: reads ``hdrspans``,
        ``col_idx``, ``colspan``, ``col`` (cell text), ``text``,
        ``global_tags``, ``refs_tags``, ``hdr_tags``, ``wxr``,
        ``tablecontext``, ``word``, ``lang``, ``pos`` and ``depth``.

        Returns ``(new_rowtags, new_coltags, all_hdr_tags)``:
        - new_rowtags: row tagsets updated with this header's tags
        - new_coltags: column tagsets including footnote-reference tags
        - all_hdr_tags: the raw expanded header tagsets (no refs merged)
        """
        new_coltags: list[tuple[str, ...]] = []
        all_hdr_tags: list[tuple[str, ...]] = []  # list of tuples
        new_rowtags: list[tuple[str, ...]] = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                base_tags: set[str] = (
                    set(rt0) | set(ct0) | set(global_tags) | set(table_tags)
                )  # Union.
                # print(f"{rt0=}, {ct0=}, {global_tags=},"
                #       f" {table_tags=}, {base_tags=}")
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                    column_number=col_idx,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags

1584 

    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        """Register a new header span for the current cell and maintain the
        left-most-column (col0) header expansion state.

        Closure over the enclosing parsing state: reads ``col_idx``,
        ``colspan``, ``rowspan``, ``rownum``, ``new_coltags``,
        ``all_headers``, ``all_hdr_tags`` and ``previously_seen``.
        Returns the (possibly unchanged) ``col`` text together with the
        updated ``col0_followed_by_nonempty`` flag and ``col0_hdrspan``.
        """
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags). In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            # First header on this row becomes the candidate col0 header.
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # print("col0_cats={} later_cats={} "
            #       "fol_by_nonempty={} col_idx={} end={} "
            #       "tagsets={}"
            #       .format(col0_cats, later_cats,
            #               col0_followed_by_nonempty, col_idx,
            #               col0_hdrspan.start +
            #               col0_hdrspan.colspan,
            #               col0_hdrspan.tagsets))
            # print("col0.rowspan={} rowspan={}"
            #       .format(col0_hdrspan.rowspan, rowspan))
            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            #   - col0_hdrspan set, continue with allowed current
            #   - col0_hdrspan set, expand, start new
            #   - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan. Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan
                # *Expand* current col0_hdrspan
                # print("EXPANDING COL0 MID: {} from {} to {} "
                #       "cols {}"
                #       .format(col0_hdrspan.text,
                #               col0_hdrspan.colspan,
                #               col_idx - col0_hdrspan.start,
                #               col0_hdrspan.tagsets))
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan

1682 

1683 def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]: 

1684 # Split the cell text into alternatives 

1685 split_extra_tags = [] 

1686 if col and is_superscript(col[0]): 1686 ↛ 1687line 1686 didn't jump to line 1687 because the condition on line 1686 was never true

1687 alts = [col] 

1688 else: 

1689 separators = [";", "•", r"\n", " or "] 

1690 if " + " not in col: 

1691 separators.append(",") 

1692 if not col.endswith("/"): 

1693 separators.append("/") 

1694 if col in special_phrase_splits: 

1695 # Use language-specific special splits. 

1696 # These are phrases and constructions that have 

1697 # unique ways of splitting, not specific characters 

1698 # to split on like with the default splitting. 

1699 alts, tags = special_phrase_splits[col] 

1700 split_extra_tags = tags.split() 

1701 for x in split_extra_tags: 

1702 assert x in valid_tags 

1703 assert isinstance(alts, (list, tuple)) 

1704 assert isinstance(tags, str) 

1705 else: 

1706 # Use default splitting. However, recognize 

1707 # language-specific replacements and change them to magic 

1708 # characters before splitting. This way we won't split 

1709 # them. This is important for, e.g., recognizing 

1710 # alternative pronouns. 

1711 # The magic characters are characters out of Unicode scope 

1712 # that are given a simple incremental value, int > unicode. 

1713 repls = {} 

1714 magic_ch = MAGIC_FIRST 

1715 trs = get_lang_conf(lang, "form_transformations") 

1716 # trs is a list of lists of strings 

1717 for _, v, _, _ in trs: 

1718 # v is a pattern string, like "^ich" 

1719 # form_transformations data is doing double-duty here, 

1720 # because the pattern strings are already known to us and 

1721 # not meant to be split. 

1722 m = re.search(v, col) 

1723 if m is not None: 

1724 # if pattern found in text 

1725 magic = chr(magic_ch) 

1726 magic_ch += 1 # next magic character value 

1727 col = re.sub(v, magic, col) # replace with magic ch 

1728 repls[magic] = m.group(0) 

1729 # remember what regex match string each magic char 

1730 # replaces. .group(0) is the whole match. 

1731 alts0 = split_at_comma_semi(col, separators=separators) 

1732 # with magic characters in place, split the text so that 

1733 # pre-transformation text is out of the way. 

1734 alts = [] 

1735 for alt in alts0: 

1736 # create a new list with the separated items and 

1737 # the magic characters replaced with the original texts. 

1738 for k, v in repls.items(): 

1739 alt = re.sub(k, v, alt) 

1740 alts.append(alt) 

1741 

1742 # Remove "*" from beginning of forms, as in non-attested 

1743 # or reconstructed forms. Otherwise it might confuse romanization 

1744 # detection. 

1745 alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts) 

1746 alts = list( 

1747 x for x in alts if not re.match(r"pronounced with |\(with ", x) 

1748 ) 

1749 alts = list( 

1750 re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts 

1751 ) 

1752 return col, alts, split_extra_tags 

1753 

    def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
        # Handle the special case where romanization is given under
        # normal form, e.g. in Russian. There can be multiple
        # comma-separated forms in each case. We also handle the case
        # where instead of romanization we have IPA pronunciation
        # (e.g., avoir/French/verb).
        #
        # Returns a list of (form, romanization, ipa) triples; empty
        # strings are used for the missing members of each triple.
        # The elif chain below is heuristic and order-sensitive: the
        # first matching layout interpretation wins.
        len2 = len(alts) // 2
        # Check for IPAs (forms first, IPAs under)
        # base, base, IPA, IPA
        if (
            len(alts) % 2 == 0  # Divisibly by two
            and all(
                re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
                for x in alts[len2:]
            )
        ):  # In the second half of alts
            nalts = list(
                (alts[i], "", alts[i + len2])
                # List of tuples: (base, "", ipa)
                for i in range(len2)
            )
        # base, base, base, IPA
        elif (
            len(alts) > 2
            and re.match(r"^\s*/.*/\s*$", alts[-1])
            and all(not x.startswith("/") for x in alts[:-1])
        ):
            # Only if the last alt is IPA: same IPA applies to every base.
            nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
        # base, IPA, IPA, IPA
        elif (
            len(alts) > 2
            and not alts[0].startswith("/")
            and all(
                re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
            )
        ):
            # First is base and the rest is IPA alternatives
            nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

        # Check for romanizations, forms first, romanizations under
        elif (
            len(alts) % 2 == 0
            and not any("(" in x for x in alts)
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        # Remove ends of strings starting from ^.
                        # Superscripts have been already removed
                        # from the string, while ^xyz needs to be
                        # removed separately, though it's usually
                        # something with a single letter?
                        "".join(xx for xx in x if not is_superscript(xx)),
                    )
                )
                == "other"
                for x in alts[:len2]
            )
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in x if not is_superscript(xx)),
                    )
                )
                in ("romanization", "english")
                for x in alts[len2:]
            )
        ):
            # First half classifies as non-Latin text, second half as
            # romanization/English: pair them positionally.
            nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
        # Check for romanizations, forms and romanizations alternating
        elif (
            len(alts) % 2 == 0
            and not any("(" in x for x in alts)
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in alts[i] if not is_superscript(xx)),
                    )
                )
                == "other"
                for i in range(0, len(alts), 2)
            )
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in alts[i] if not is_superscript(xx)),
                    )
                )
                in ("romanization", "english")
                for i in range(1, len(alts), 2)
            )
        ):
            # Even indexes are base forms, odd indexes their romanizations.
            nalts = list(
                (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
            )
        # Handle complex Georgian entries with alternative forms and
        # romanizations. It's a bit of a mess. Remove this kludge if not
        # needed anymore. NOTE THAT THE PARENTHESES ON THE WEBSITE ARE NOT
        # DISPLAYED. They are put inside their own span elements that are
        # then hidden with some CSS.
        # https://en.wiktionary.org/wiki/%E1%83%90%E1%83%9B%E1%83%94%E1%83%A0%E1%83%98%E1%83%99%E1%83%98%E1%83%A1_%E1%83%A8%E1%83%94%E1%83%94%E1%83%A0%E1%83%97%E1%83%94%E1%83%91%E1%83%A3%E1%83%9A%E1%83%98_%E1%83%A8%E1%83%A2%E1%83%90%E1%83%A2%E1%83%94%E1%83%91%E1%83%98
        # ამერიკის შეერთებულ შტატებს(ა) (ameriḳis šeertebul šṭaṭebs(a))
        # The above should generate two alts entries, with two different
        # parallel versions, one without (a) and with (a) at the end,
        # for both the Georgian original and the romanization.
        elif (
            tablecontext.template_name == "ka-decl-noun"
            and len(alts) >= 1
            and any(" (" in alt_ for alt_ in alts)
        ):
            nalts = ka_decl_noun_template_cell(alts)
        else:
            # Default: expand parenthesized optional segments like
            # "kind(er)" or "lampai(tten/den)" into all variants.
            new_alts = []
            for alt in alts:
                lst = [""]
                idx = 0
                for m in re.finditer(
                    r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                    # start OR letter OR asterisk (word/word*)
                    # \\___________group 1_______/ \ \_g3_///
                    #               \              \__gr. 2_//
                    #  \_____________group 0________________/
                    alt,
                ):
                    v = m.group(2)  # (word/word/word...)
                    if (
                        classify_desc(v) == "tags"  # Tags inside parens
                        or m.group(0) == alt
                    ):  # All in parens
                        continue
                    new_lst = []
                    for x in lst:
                        x += alt[idx : m.start()] + m.group(1)
                        # alt until letter or asterisk
                        idx = m.end()
                        vparts = v.split("/")
                        # group(2) = ["word", "wörd"...]
                        if len(vparts) == 1:
                            new_lst.append(x)
                            new_lst.append(x + v)
                            # "kind(er)" -> ["kind", "kinder"]
                        else:
                            for vv in vparts:
                                new_lst.append(x + vv)
                            # "lampai(tten/den)" ->
                            # ["lampaitten", "lampaiden"]
                    lst = new_lst
                for x in lst:
                    new_alts.append(x + alt[idx:])
                    # add the end of alt
            nalts = list((x, "", "") for x in new_alts)
            # [form, no romz, no ipa]
        return nalts

1917 

1918 def find_semantic_parens(form: str) -> tuple[str, list[str]]: 

1919 # "Some languages" (=Greek) use brackets to mark things that 

1920 # require tags, like (informality), [rarity] and {archaicity}. 

1921 extra_tags = [] 

1922 if re.match(r"\([^][(){}]*\)$", form): 

1923 if get_lang_conf(lang, "parentheses_for_informal"): 

1924 form = form[1:-1] 

1925 extra_tags.append("informal") 

1926 else: 

1927 form = form[1:-1] 

1928 elif re.match(r"\{\[[^][(){}]*\]\}$", form): 

1929 if get_lang_conf( 1929 ↛ 1936line 1929 didn't jump to line 1936 because the condition on line 1929 was always true

1930 lang, "square_brackets_for_rare" 

1931 ) and get_lang_conf(lang, "curly_brackets_for_archaic"): 

1932 # είμαι/Greek/Verb 

1933 form = form[2:-2] 

1934 extra_tags.extend(["rare", "archaic"]) 

1935 else: 

1936 form = form[2:-2] 

1937 elif re.match(r"\{[^][(){}]*\}$", form): 

1938 if get_lang_conf(lang, "curly_brackets_for_archaic"): 1938 ↛ 1943line 1938 didn't jump to line 1943 because the condition on line 1938 was always true

1939 # είμαι/Greek/Verb 

1940 form = form[1:-1] 

1941 extra_tags.extend(["archaic"]) 

1942 else: 

1943 form = form[1:-1] 

1944 elif re.match(r"\[[^][(){}]*\]$", form): 

1945 if get_lang_conf(lang, "square_brackets_for_rare"): 1945 ↛ 1950line 1945 didn't jump to line 1950 because the condition on line 1945 was always true

1946 # είμαι/Greek/Verb 

1947 form = form[1:-1] 

1948 extra_tags.append("rare") 

1949 else: 

1950 form = form[1:-1] 

1951 return form, extra_tags 

1952 

    def handle_parens(
        form: str, roman: str, clitic: str | None, extra_tags: list[str]
    ) -> tuple[str, str, str | None]:
        # Interpret one parenthesized segment of a form.  NOTE(review):
        # this closure reads `paren` (the paren contents), `m` (the
        # regex match locating the parens within `form`) and `subst`
        # (the replacement text) from the enclosing scope, which is not
        # visible here — presumably set by a loop over paren matches in
        # the caller; confirm against the enclosing code.
        # May mutate `extra_tags` in place; returns updated
        # (form, roman, clitic).
        if TYPE_CHECKING:
            assert isinstance(paren, str)
            assert isinstance(m, re.Match)
        if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
            # is there a clitic starting with apostrophe?
            clitic = paren
            # assume the whole paren is a clitic
            # then remove paren from form
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        elif classify_desc(paren) == "tags":
            tagsets1, topics1 = decode_tags(paren)
            if not topics1:
                for ts in tagsets1:
                    ts = tuple(x for x in ts if " " not in x)
                    # There are some generated tags containing
                    # spaces; do not let them through here.
                    extra_tags.extend(ts)
                form = (form[: m.start()] + subst + form[m.end() :]).strip()
        # brackets contain romanization
        elif (
            m.start() > 0
            and not roman
            and classify_desc(form[: m.start()]) == "other"
            and
            # "other" ~ text
            classify_desc(paren) in ("romanization", "english")
            and not re.search(r"^with |-form$", paren)
        ):
            roman = paren
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        elif re.search(r"^with |-form", paren):
            # "with ..." / "...-form" notes are simply dropped from the form.
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        return form, roman, clitic

1989 

    def merge_row_and_column_tags(
        form: str,
        some_has_covered_text: bool,
        links: list[tuple[str, str]] | None = None,
    ) -> tuple[list[FormData], str, bool]:
        # Merge column tags and row tags. We give preference
        # to moods etc coming from rowtags (cf. austteigen/German/Verb
        # imperative forms).

        # In certain cases, what a tag means depends on whether
        # it is a row or column header. Depending on the language,
        # we replace certain tags with others if they're in
        # a column or row

        # NOTE(review): the `links` parameter is not referenced in this
        # body; link matching below uses the closure variable
        # `cell_links` instead — confirm whether `links` is vestigial.
        # This closure also reads `rowtags`, `coltags`, `global_tags`,
        # `extra_tags`, `refs_tags`, `tablecontext`, `source`, `roman`,
        # `ipa`, `clitic`, `cell_links`, `col_idx` and
        # `has_covering_hdr` from the enclosing scope.
        ret: list[FormData] = []
        # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
        # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
        for rt in sorted(rowtags):
            if "dummy-use-as-coltags" in rt:
                continue
            # if lang was in rowtag_replacements)
            # if not rtagreplacs == None:
            #    rt = replace_directional_tags(rt, rtagreplacs)
            for ct in sorted(coltags):
                if "dummy-use-as-rowtags" in ct:
                    continue
                # if lang was in coltag_replacements
                # if not ctagreplacs == None:
                #    ct = replace_directional_tags(ct,
                #                                  ctagreplacs)
                tags = set(global_tags)
                tags.update(extra_tags)
                tags.update(rt)
                tags.update(refs_tags)
                tags.update(tablecontext.section_header)
                # Merge tags from column. For certain kinds of tags,
                # those coming from row take precedence.
                old_tags = set(tags)
                for t in ct:
                    c = valid_tags[t]
                    if c in ("mood", "case", "number") and any(
                        valid_tags[tt] == c for tt in old_tags
                    ):
                        continue
                    tags.add(t)

                # Extract language-specific tags from the
                # form. This may also adjust the form.
                form, lang_tags = lang_specific_tags(lang, pos, form)
                tags.update(lang_tags)

                # For non-finite verb forms, see if they have
                # a gender/class suffix
                if pos == "verb" and any(
                    valid_tags[t] == "non-finite" for t in tags
                ):
                    form, tt = parse_head_final_tags(wxr, lang, form)
                    tags.update(tt)

                # Remove "personal" tag if have nth person; these
                # come up with e.g. reconhecer/Portuguese/Verb. But
                # not if we also have "pronoun"
                if (
                    "personal" in tags
                    and "pronoun" not in tags
                    and any(
                        x in tags
                        for x in [
                            "first-person",
                            "second-person",
                            "third-person",
                        ]
                    )
                ):
                    tags.remove("personal")

                # If we have impersonal, remove person and number.
                # This happens with e.g. viajar/Portuguese/Verb
                if "impersonal" in tags:
                    tags = tags - set(
                        [
                            "first-person",
                            "second-person",
                            "third-person",
                            "singular",
                            "plural",
                        ]
                    )

                # Remove unnecessary "positive" tag from verb forms
                if pos == "verb" and "positive" in tags:
                    if "negative" in tags:
                        tags.remove("negative")
                    tags.remove("positive")

                # Many Russian (and other Slavic) inflection tables
                # have animate/inanimate distinction that generates
                # separate entries for neuter/feminine, but the
                # distinction only applies to masculine. Remove them
                # from neuter/feminine and eliminate duplicates.
                if get_lang_conf(lang, "masc_only_animate"):
                    for t1 in ("animate", "inanimate"):
                        for t2 in ("neuter", "feminine"):
                            if (
                                t1 in tags
                                and t2 in tags
                                and "masculine" not in tags
                                and "plural" not in tags
                            ):
                                tags.remove(t1)

                # German adjective tables contain "(keiner)" etc
                # for mixed declension plural. When the adjective
                # disappears and it becomes just one word, remove
                # the "includes-article" tag. e.g. eiskalt/German
                if "includes-article" in tags and " " not in form:
                    tags.remove("includes-article")

                # Handle ignored forms. We mark that the form was
                # provided. This is important information; some words
                # just do not have a certain form. However, there also
                # many cases where no word in a language has a
                # particular form. Post-processing could detect and
                # remove such cases.
                if form in IGNORED_COLVALUES:
                    # if cell text seems to be ignorable
                    if "dummy-ignore-skipped" in tags:
                        continue
                    if (
                        col_idx not in has_covering_hdr
                        and some_has_covered_text
                    ):
                        continue
                    # don't ignore this cell if there's been a header
                    # above it
                    form = "-"
                elif col_idx in has_covering_hdr:
                    some_has_covered_text = True

                # Handle ambiguous object concord. If a header
                # gives the "dummy-object-concord"-tag to a word,
                # replace person, number and gender tags with
                # their "object-" counterparts so that the verb
                # agrees with the object instead.
                # Use only when the verb has ONLY object agreement!
                # a پخول/Pashto
                if "dummy-object-concord" in tags:
                    for subtag, objtag in object_concord_replacements.items():
                        if subtag in tags:
                            tags.remove(subtag)
                            tags.add(objtag)

                # Remove the dummy mood tag that we sometimes
                # use to block adding other mood and related
                # tags
                tags = tags - set(
                    [
                        "dummy-mood",
                        "dummy-tense",
                        "dummy-ignore-skipped",
                        "dummy-object-concord",
                        "dummy-reset-headers",
                        "dummy-use-as-coltags",
                        "dummy-use-as-rowtags",
                        "dummy-store-hdrspan",
                        "dummy-load-stored-hdrspans",
                        "dummy-reset-stored-hdrspans",
                        "dummy-section-header",
                    ]
                )

                # Perform language-specific tag replacements according
                # to rules in a table.
                lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
                if lang_tag_mappings is not None:
                    for pre, post in lang_tag_mappings.items():
                        if all(t in tags for t in pre):
                            tags = (tags - set(pre)) | set(post)

                # Warn if there are entries with empty tags
                if not tags:
                    wxr.wtp.debug(
                        "inflection table: empty tags for {}".format(form),
                        sortid="inflection/1826",
                    )

                # Warn if form looks like IPA
                ########## XXX ########
                # Because IPA is its own unicode block, we could also
                # technically do a Unicode name check to see if a string
                # contains IPA. Not all valid IPA characters are in the
                # IPA extension block, so you can technically have false
                # negatives if it's something like /toki/, but it
                # shouldn't give false positives.
                # Alternatively, you could make a list of IPA-admissible
                # characters and reject non-IPA stuff with that.
                if re.match(r"\s*/.*/\s*$", form):
                    wxr.wtp.debug(
                        "inflection table form looks like IPA: "
                        "form={} tags={}".format(form, tags),
                        sortid="inflection/1840",
                    )

                # Note that this checks `form`, not `in tags`
                if form == "dummy-ignored-text-cell":
                    continue

                if "dummy-remove-this-cell" in tags:
                    continue

                # Add the form
                tags_list = list(sorted(tags))
                dt: FormData = {
                    "form": form,
                    "tags": tags_list,
                    "source": source,
                }
                if roman:
                    dt["roman"] = roman
                if ipa:
                    dt["ipa"] = ipa
                if cell_links is not None and (
                    matched_links := match_links_to_form(
                        wxr, form, cell_links, None
                    )
                ):
                    dt["links"] = matched_links
                ret.append(dt)
                # If we got separate clitic form, add it
                if clitic:
                    dt = {
                        "form": clitic,
                        "tags": tags_list + ["clitic"],
                        "source": source,
                    }
                    ret.append(dt)
        return ret, form, some_has_covered_text

2227 

2228 # First extract definitions from cells 

2229 # See defs_ht for footnote defs stuff 

2230 for row in rows: 

2231 for cell in row: 

2232 text, refs, defs, hdr_tags = extract_cell_content( 

2233 lang, word, cell.text 

2234 ) 

2235 # refs, defs = footnote stuff, defs -> (ref, def) 

2236 add_defs(defs) 

2237 # Extract definitions from text after table 

2238 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after) 

2239 add_defs(defs) 

2240 

2241 # Then extract the actual forms 

2242 ret = [] 

2243 hdrspans: list[HdrSpan] = [] 

2244 first_col_has_text = False 

2245 rownum = 0 

2246 title = None 

2247 global_tags = [] 

2248 table_tags = [] 

2249 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits") 

2250 form_replacements = get_lang_conf(lang, "form_replacements") 

2251 form_transformations = get_lang_conf(lang, "form_transformations") 

2252 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells") 

2253 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups") 

2254 

2255 for title in titles: 

2256 more_global_tags, more_table_tags, extra_forms = parse_title( 

2257 title, source 

2258 ) 

2259 global_tags.extend(more_global_tags) 

2260 table_tags.extend(more_table_tags) 

2261 ret.extend(extra_forms) 

2262 cell_rowcnt: collections.defaultdict[int, int] = collections.defaultdict( 

2263 int 

2264 ) 

2265 seen_cells = set() 

2266 has_covering_hdr = set() 

2267 some_has_covered_text = False 

2268 for row in rows: 

2269 # print("ROW:", row) 

2270 # print("====") 

2271 # print(f"Start of PREVIOUS row hdrspans:" 

2272 # f"{tuple(sp.tagsets for sp in hdrspans)}") 

2273 # print(f"Start of row txt: {tuple(t.text for t in row)}") 

2274 if not row: 2274 ↛ 2275line 2274 didn't jump to line 2275 because the condition on line 2274 was never true

2275 continue # Skip empty rows 

2276 all_headers = all(x.is_title or not x.text.strip() for x in row) 

2277 text = row[0].text 

2278 if ( 

2279 row[0].is_title 

2280 and text 

2281 and not is_superscript(text[0]) 

2282 and text not in infl_map # zealous inflation map? 

2283 and ( 

2284 re.match(r"Inflection ", text) 

2285 or re.sub( 

2286 r"\s+", 

2287 " ", # flatten whitespace 

2288 re.sub( 

2289 r"\s*\([^)]*\)", 

2290 "", 

2291 # Remove whitespace+parens 

2292 text, 

2293 ), 

2294 ).strip() 

2295 not in infl_map 

2296 ) 

2297 and not re.match(infl_start_re, text) 

2298 and all( 

2299 x.is_title == row[0].is_title and x.text == text 

2300 # all InflCells in `row` have the same is_title and text 

2301 for x in row 

2302 ) 

2303 ): 

2304 if text and title is None: 

2305 # Only if there were no titles previously make the first 

2306 # text that is found the title 

2307 title = text 

2308 if re.match(r"(Note:|Notes:)", title): 2308 ↛ 2309line 2308 didn't jump to line 2309 because the condition on line 2308 was never true

2309 continue # not a title 

2310 more_global_tags, more_table_tags, extra_forms = parse_title( 

2311 title, source 

2312 ) 

2313 global_tags.extend(more_global_tags) 

2314 table_tags.extend(more_table_tags) 

2315 ret.extend(extra_forms) 

2316 continue # Skip title rows without incrementing i 

2317 if "dummy-skip-this" in global_tags: 2317 ↛ 2318line 2317 didn't jump to line 2318 because the condition on line 2317 was never true

2318 return [] 

2319 rowtags: list[tuple[str, ...]] = [()] 

2320 # have_hdr = False 

2321 # have_hdr never used? 

2322 have_text = False 

2323 samecell_cnt = 0 

2324 col0_hdrspan = None # col0 or later header (despite its name) 

2325 col0_followed_by_nonempty = False 

2326 row_empty = True 

2327 for col_idx, cell in enumerate(row): 

2328 colspan = cell.colspan # >= 1 

2329 rowspan = cell.rowspan # >= 1 

2330 cell_links = cell.links # for weird links 

2331 previously_seen = id(cell) in seen_cells 

2332 # checks to see if this cell was in the previous ROW 

2333 seen_cells.add(id(cell)) 

2334 if samecell_cnt == 0: 

2335 # First column of a (possible multi-column) cell 

2336 samecell_cnt = colspan - 1 

2337 else: 

2338 assert samecell_cnt > 0 

2339 samecell_cnt -= 1 

2340 continue 

2341 

2342 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0 

2343 # never used? 

2344 

2345 # defaultdict(int) around line 1900 

2346 cell_rowcnt[id(cell)] += 1 

2347 # => how many cols this spans 

2348 col: str = cell.text 

2349 if not col: 

2350 continue 

2351 row_empty = False 

2352 is_title = cell.is_title 

2353 

2354 # If the cell has a target, i.e., text after colon, interpret 

2355 # it as simply specifying a value for that value and ignore 

2356 # it otherwise. 

2357 if cell.target: 

2358 text, refs, defs, hdr_tags = extract_cell_content( 

2359 lang, word, col 

2360 ) 

2361 if not text: 2361 ↛ 2362line 2361 didn't jump to line 2362 because the condition on line 2361 was never true

2362 continue 

2363 refs_tags: set[str] = set() 

2364 for ref in refs: # gets tags from footnotes 2364 ↛ 2365line 2364 didn't jump to line 2365 because the loop on line 2364 never started

2365 if ref in def_ht: 

2366 refs_tags.update(def_ht[ref]) 

2367 rowtags = expand_header( 

2368 wxr, 

2369 tablecontext, 

2370 word, 

2371 lang, 

2372 pos, 

2373 text, 

2374 [], 

2375 silent=True, 

2376 depth=depth, 

2377 column_number=col_idx, 

2378 ) 

2379 rowtags = list( 

2380 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags) 

2381 ) 

2382 is_title = False 

2383 col = cell.target 

2384 

2385 # print(rownum, col_idx, col) 

2386 # print(f"is_title: {is_title}") 

2387 if is_title: 

2388 # It is a header cell 

2389 text, refs, defs, hdr_tags = extract_cell_content( 

2390 lang, word, col 

2391 ) 

2392 if not text: 

2393 continue 

2394 # Extract tags from referenced footnotes 

2395 refs_tags = set() 

2396 for ref in refs: 

2397 if ref in def_ht: 

2398 refs_tags.update(def_ht[ref]) 

2399 

2400 # Expand header to tags 

2401 v = expand_header( 

2402 wxr, 

2403 tablecontext, 

2404 word, 

2405 lang, 

2406 pos, 

2407 text, 

2408 [], 

2409 silent=True, 

2410 depth=depth, 

2411 column_number=col_idx, 

2412 ) 

2413 # print("EXPANDED {!r} to {}".format(text, v)) 

2414 

2415 if col_idx == 0: 

2416 # first_col_has_text is used for a test to ignore 

2417 # upper-left cells that are just text without 

2418 # header info 

2419 first_col_has_text = True 

2420 # Check if the header expands to reset hdrspans 

2421 if any("dummy-reset-headers" in tt for tt in v): 

2422 new_hdrspans = [] 

2423 for hdrspan in hdrspans: 

2424 # if there are HdrSpan objects (abstract headers with 

2425 # row- and column-spans) that are to the left or at the 

2426 # same row or below, KEEP those; things above and to 

2427 # the right of the hdrspan with dummy-reset-headers 

2428 # are discarded. Tags from the header together with 

2429 # dummy-reset-headers are kept as normal. 

2430 if ( 

2431 hdrspan.start + hdrspan.colspan < col_idx 

2432 or hdrspan.rownum > rownum - cell.rowspan 

2433 ): 

2434 new_hdrspans.append(hdrspan) 

2435 hdrspans = new_hdrspans 

2436 

2437 for tt in v: 

2438 if "dummy-section-header" in tt: 2438 ↛ 2439line 2438 didn't jump to line 2439 because the condition on line 2438 was never true

2439 tablecontext.section_header = tt 

2440 break 

2441 if "dummy-reset-section-header" in tt: 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true

2442 tablecontext.section_header = tuple() 

2443 # Text between headers on a row causes earlier headers to 

2444 # be reset 

2445 if have_text: 

2446 # print(" HAVE_TEXT BEFORE HDR:", col) 

2447 # Reset rowtags if new title column after previous 

2448 # text cells 

2449 # +-----+-----+-----+-----+ 

2450 # |hdr-a|txt-a|hdr-B|txt-B| 

2451 # +-----+-----+-----+-----+ 

2452 # ^reset rowtags=> 

2453 # XXX beware of header "—": "" - must not clear on that if 

2454 # it expands to no tags 

2455 rowtags = [()] 

2456 # have_hdr = True 

2457 # have_hdr never used? 

2458 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags)) 

2459 # Update rowtags and coltags 

2460 has_covering_hdr.add(col_idx) # col_idx == current column 

2461 # has_covering_hdr is a set that has the col_idx-ids of columns 

2462 # that have previously had some kind of header. It is never 

2463 # resetted inside the col_idx-loops OR the bigger rows-loop, so 

2464 # applies to the whole table. 

2465 

2466 new_coltags: list[tuple[str, ...]] 

2467 all_hdr_tags: list[tuple[str, ...]] 

2468 rowtags, new_coltags, all_hdr_tags = generate_tags( 

2469 rowtags, table_tags 

2470 ) 

2471 

2472 if any("dummy-skip-this" in ts for ts in rowtags): 

2473 continue # Skip this cell 

2474 

2475 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2475 ↛ 2476line 2475 didn't jump to line 2476 because the condition on line 2475 was never true

2476 hdrspans.extend(tablecontext.stored_hdrspans) 

2477 

2478 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2478 ↛ 2479line 2478 didn't jump to line 2479 because the condition on line 2478 was never true

2479 tablecontext.stored_hdrspans = [] 

2480 

2481 if any("dummy-store-hdrspan" in ts for ts in v): 2481 ↛ 2483line 2481 didn't jump to line 2483 because the condition on line 2481 was never true

2482 # print(f"STORED: {col}") 

2483 store_new_hdrspan = True 

2484 else: 

2485 store_new_hdrspan = False 

2486 

2487 new_coltags = list( 

2488 x 

2489 for x in new_coltags 

2490 if not any(t in noinherit_tags for t in x) 

2491 ) 

2492 # print("new_coltags={} previously_seen={} all_hdr_tags={}" 

2493 # .format(new_coltags, previously_seen, all_hdr_tags)) 

2494 if any(new_coltags): 

2495 ( 

2496 col, 

2497 col0_followed_by_nonempty, 

2498 col0_hdrspan, 

2499 ) = add_new_hdrspan( 

2500 col, 

2501 hdrspans, 

2502 store_new_hdrspan, 

2503 col0_followed_by_nonempty, 

2504 col0_hdrspan, 

2505 ) 

2506 

2507 continue 

2508 

2509 # These values are ignored, at least for now 

2510 if re.match(r"^(# |\(see )", col): 2510 ↛ 2511line 2510 didn't jump to line 2511 because the condition on line 2510 was never true

2511 continue 

2512 

2513 if any("dummy-skip-this" in ts for ts in rowtags): 

2514 continue # Skip this cell 

2515 

2516 # If the word has no rowtags and is a multi-row cell, then 

2517 # ignore this. This happens with empty separator rows 

2518 # within a rowspan>1 cell. cf. wander/English/Conjugation. 

2519 if rowtags == [()] and rowspan > 1: 

2520 continue 

2521 

2522 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle. 

2523 if cleanup_rules: 

2524 for regx, substitution in cleanup_rules.items(): 

2525 col = re.sub(regx, substitution, col) 

2526 

2527 if ( 2527 ↛ 2532line 2527 didn't jump to line 2532 because the condition on line 2527 was never true

2528 col_idx == 0 

2529 and not first_col_has_text 

2530 and get_lang_conf(lang, "ignore_top_left_text_cell") is True 

2531 ): 

2532 continue # Skip text at top left, as in Icelandic, Faroese 

2533 

2534 # if col0_hdrspan is not None: 

2535 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}" 

2536 # .format(col0_hdrspan.text, col)) 

2537 col0_followed_by_nonempty = True 

2538 have_text = True 

2539 

2540 # Determine column tags for the multi-column cell 

2541 combined_coltags = compute_coltags( 

2542 lang, pos, hdrspans, col_idx, colspan, col 

2543 ) 

2544 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2544 ↛ 2545line 2544 didn't jump to line 2545 because the condition on line 2544 was never true

2545 continue 

2546 

2547 # Split the text into separate forms. First simplify spaces except 

2548 # newline. 

2549 col = re.sub(r"[ \t\r]+", " ", col) 

2550 # Split the cell text into alternatives 

2551 

2552 col, alts, split_extra_tags = split_text_into_alts(col) 

2553 

2554 # Some cells have mixed form content, like text and romanization, 

2555 # or text and IPA. Handle these. 

2556 altss = handle_mixed_lines(alts) 

2557 

2558 altsss = list((x, combined_coltags, cell_links) for x in altss) 

2559 

2560 # Generate forms from the alternatives 

2561 # alts is a list of (tuple of forms, tuple of tags) 

2562 coltags: list[tuple[str, ...]] 

2563 base_roman: str 

2564 ipa: str 

2565 for (form, base_roman, ipa), coltags, cell_links in altsss: 

2566 form = form.strip() 

2567 extra_tags: list[str] = [] 

2568 extra_tags.extend(split_extra_tags) 

2569 # Handle special splits again here, so that we can have custom 

2570 # mappings from form to form and tags. 

2571 if form in form_replacements: 

2572 replacement, tags = form_replacements[form] 

2573 for x in tags.split(): 

2574 assert x in valid_tags 

2575 assert isinstance(replacement, str) 

2576 assert isinstance(tags, str) 

2577 form = replacement 

2578 extra_tags.extend(tags.split()) 

2579 

2580 check_romanization_form_transformation = False 

2581 # loop over regexes in form_transformation and replace text 

2582 # in form using regex patterns 

2583 # this does a bit of the same stuff the above does, 

2584 # but with regexes and re.sub() instead 

2585 subst: str 

2586 for ( 

2587 form_transformations_pos, 

2588 vv, 

2589 subst, 

2590 tags, 

2591 ) in form_transformations: 

2592 # v is a pattern string, like "^ich" 

2593 if ( 

2594 isinstance(form_transformations_pos, str) 

2595 and pos != form_transformations_pos 

2596 ) or ( 

2597 (not isinstance(form_transformations_pos, str)) 

2598 and pos not in form_transformations_pos 

2599 ): 

2600 continue 

2601 m: re.Match | None = re.search(vv, form) 

2602 if m is not None: 

2603 if base_roman: 2603 ↛ 2604line 2603 didn't jump to line 2604 because the condition on line 2603 was never true

2604 for _, rom_v, rom_sub, _ in form_transformations: 

2605 rom_m = re.search(rom_v, base_roman) 

2606 if rom_m is not None: 

2607 base_roman = re.sub( 

2608 rom_v, rom_sub, base_roman 

2609 ) 

2610 break 

2611 form = re.sub(vv, subst, form) 

2612 for x in tags.split(): 

2613 assert x in valid_tags 

2614 extra_tags.extend(tags.split()) 

2615 check_romanization_form_transformation = True 

2616 break 

2617 

2618 # Clean the value, extracting reference symbols 

2619 form, refs, defs, hdr_tags = extract_cell_content( 

2620 lang, word, form 

2621 ) 

2622 # if refs: 

2623 # print("REFS:", refs) 

2624 extra_tags.extend(hdr_tags) 

2625 # Extract tags from referenced footnotes 

2626 refs_tags = set() 

2627 for ref in refs: 

2628 if ref in def_ht: 

2629 refs_tags.update(def_ht[ref]) 

2630 

2631 if base_roman: 

2632 if check_romanization_form_transformation: 2632 ↛ 2636line 2632 didn't jump to line 2636 because the condition on line 2632 was never true

2633 # because form_transformations are used to handle things 

2634 # where the romanization has the "same" structure, we 

2635 # need to handle that here too.... 

2636 for ( 

2637 _, 

2638 vv, 

2639 subst, 

2640 _, 

2641 ) in form_transformations: 

2642 # v is a pattern string, like "^ich" 

2643 m = re.search(vv, base_roman) 

2644 if m is not None: 

2645 base_roman = re.sub(vv, subst, base_roman) 

2646 # XXX add tag stuff here if needed 

2647 break 

2648 

2649 base_roman, _, _, hdr_tags = extract_cell_content( 

2650 lang, word, base_roman 

2651 ) 

2652 extra_tags.extend(hdr_tags) 

2653 

2654 # Do some additional cleanup on the cell. 

2655 form = re.sub(r"^\s*,\s*", "", form) 

2656 form = re.sub(r"\s*,\s*$", "", form) 

2657 form = re.sub(r"\s*(,\s*)+", ", ", form) 

2658 form = re.sub(r"(?i)^Main:", "", form) 

2659 form = re.sub(r"\s+", " ", form) 

2660 form = form.strip() 

2661 

2662 # Look for parentheses that have semantic meaning 

2663 form, et = find_semantic_parens(form) 

2664 extra_tags.extend(et) 

2665 

2666 # Handle parentheses in the table element. We parse 

2667 # tags anywhere and romanizations anywhere but beginning. 

2668 roman: str = base_roman 

2669 paren: str | None = None 

2670 clitic: str | None = None 

2671 m = re.search(r"(\s+|^)\(([^)]*)\)", form) 

2672 # start|spaces + (anything) 

2673 if m is not None: 

2674 subst = m.group(1) 

2675 paren = m.group(2) 

2676 else: 

2677 m = re.search(r"\(([^)]*)\)(\s+|$)", form) 

2678 # (anything) + spaces|end 

2679 if m is not None: 2679 ↛ 2680line 2679 didn't jump to line 2680 because the condition on line 2679 was never true

2680 paren = m.group(1) 

2681 subst = m.group(2) 

2682 if paren is not None: 

2683 form, roman, clitic = handle_parens( 

2684 form, roman, clitic, extra_tags 

2685 ) 

2686 

2687 # Ignore certain forms that are not really forms, 

2688 # unless they're really, really close to the article title 

2689 if form in ( 2689 ↛ 2694line 2689 didn't jump to line 2694 because the condition on line 2689 was never true

2690 "", 

2691 "unchanged", 

2692 "after an", # in sona/Irish/Adj/Mutation 

2693 ): 

2694 Lev = distw([form], word) 

2695 if form and Lev < 0.1: 

2696 wxr.wtp.debug( 

2697 "accepted possible false positive '{}' with" 

2698 "> 0.1 Levenshtein distance in {}/{}".format( 

2699 form, word, lang 

2700 ), 

2701 sortid="inflection/2213", 

2702 ) 

2703 elif form and Lev < 0.3: 

2704 wxr.wtp.debug( 

2705 "skipped possible match '{}' with > 0.3" 

2706 "Levenshtein distance in {}/{}".format( 

2707 form, word, lang 

2708 ), 

2709 sortid="inflection/2218", 

2710 ) 

2711 continue 

2712 else: 

2713 continue 

2714 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} " 

2715 # "FORM={!r} ROMAN={!r}" 

2716 # .format(rowtags, coltags, refs_tags, 

2717 # form, roman)) 

2718 

2719 # Merge tags from row and column and do miscellaneous 

2720 # tag-related handling. 

2721 ( 

2722 merge_ret, 

2723 form, 

2724 some_has_covered_text, 

2725 ) = merge_row_and_column_tags( 

2726 form, some_has_covered_text, cell_links 

2727 ) 

2728 ret.extend(merge_ret) 

2729 

2730 # End of row. 

2731 rownum += 1 

2732 # For certain languages, if the row was empty, reset 

2733 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb). 

2734 if row_empty and get_lang_conf(lang, "empty_row_resets"): 

2735 hdrspans = [] 

2736 # Check if we should expand col0_hdrspan. 

2737 if col0_hdrspan is not None: 

2738 col0_allowed = get_lang_conf(lang, "hdr_expand_first") 

2739 col0_cats = tagset_cats(col0_hdrspan.tagsets) 

2740 # Only expand if col0_cats and later_cats are allowed 

2741 # and don't overlap and col0 has tags, and there have 

2742 # been no disallowed cells in between. 

2743 if ( 

2744 not col0_followed_by_nonempty 

2745 and not (col0_cats - col0_allowed) 

2746 and 

2747 # len(col0_cats) == 1 and 

2748 col_idx > col0_hdrspan.start + col0_hdrspan.colspan 

2749 ): 

2750 # If an earlier header is only followed by headers that yield 

2751 # no tags, expand it to entire row 

2752 # print("EXPANDING COL0: {} from {} to {} cols {}" 

2753 # .format(col0_hdrspan.text, col0_hdrspan.colspan, 

2754 # len(row) - col0_hdrspan.start, 

2755 # col0_hdrspan.tagsets)) 

2756 col0_hdrspan.colspan = len(row) - col0_hdrspan.start 

2757 col0_hdrspan.expanded = True 

2758 # XXX handle refs and defs 

2759 # for x in hdrspans: 

2760 # print(" HDRSPAN {} {} {} {!r}" 

2761 # .format(x.start, x.colspan, x.tagsets, x.text)) 

2762 

2763 # Post-process German nouns with articles in separate columns. We move the 

2764 # definite/indefinite/usually-without-article markers into the noun and 

2765 # remove the article entries. 

2766 if get_lang_conf(lang, "articles_in_separate_columns") and any( 

2767 "noun" in x["tags"] for x in ret 

2768 ): 

2769 new_ret = [] 

2770 saved_tags: set[str] = set() 

2771 had_noun = False 

2772 for dt in ret: 

2773 tags = dt["tags"] 

2774 # print(tags) 

2775 if "noun" in tags: 

2776 tags = list( 

2777 sorted(set(t for t in tags if t != "noun") | saved_tags) 

2778 ) 

2779 had_noun = True 

2780 elif ( 2780 ↛ 2807line 2780 didn't jump to line 2807 because the condition on line 2780 was always true

2781 "indefinite" in tags 

2782 or "definite" in tags 

2783 or "usually-without-article" in tags 

2784 or "without-article" in tags 

2785 ): 

2786 if had_noun: 

2787 saved_tags = set(tags) 

2788 else: 

2789 saved_tags = saved_tags | set(tags) # E.g. Haus/German 

2790 remove_useless_tags(lang, pos, saved_tags) 

2791 saved_tags = saved_tags & set( 

2792 [ 

2793 "masculine", 

2794 "feminine", 

2795 "neuter", 

2796 "singular", 

2797 "plural", 

2798 "indefinite", 

2799 "definite", 

2800 "usually-without-article", 

2801 "without-article", 

2802 ] 

2803 ) 

2804 had_noun = False 

2805 continue # Skip the articles 

2806 

2807 dt = dt.copy() 

2808 dt["tags"] = tags 

2809 new_ret.append(dt) 

2810 ret = new_ret 

2811 

2812 elif possibly_ignored_forms: 

2813 # Some languages have tables with cells that are kind of separated 

2814 # and difficult to handle, like eulersche Formel/German where 

2815 # the definite and indefinite articles are just floating. 

2816 # If a language has a dict of conditionally_ignored_cells, 

2817 # and if the contents of a cell is found in one of the rules 

2818 # there, ignore that cell if it 

2819 # 1. Does not have the appropriate tag (like "definite" for "die") 

2820 # and 

2821 # 2. The title of the article is not one of the other co-words 

2822 # (ie. it's an article for the definite articles in german etc.) 

2823 # pass 

2824 new_ret = [] 

2825 for cell_data in ret: 

2826 tags = cell_data["tags"] 

2827 text = cell_data["form"] 

2828 skip_this = False 

2829 for key_tag, ignored_forms in possibly_ignored_forms.items(): 

2830 if text not in ignored_forms: 2830 ↛ 2832line 2830 didn't jump to line 2832 because the condition on line 2830 was always true

2831 continue 

2832 if word in ignored_forms: 

2833 continue 

2834 if key_tag not in tags: 

2835 skip_this = True 

2836 

2837 if skip_this: 2837 ↛ 2838line 2837 didn't jump to line 2838 because the condition on line 2837 was never true

2838 continue 

2839 new_ret.append(cell_data) 

2840 

2841 ret = new_ret 

2842 

2843 # Post-process English inflection tables, addding "multiword-construction" 

2844 # when the number of words has increased. 

2845 if lang == "English" and pos == "verb": 

2846 word_words = len(word.split()) 

2847 new_ret = [] 

2848 for dt in ret: 

2849 form = dt.get("form", "") 

2850 if len(form.split()) > word_words: 

2851 dt = dt.copy() 

2852 dt["tags"] = list(dt.get("tags", [])) 

2853 # This strange copy-assigning shuffle is preventative black 

2854 # magic; do not touch lest you invoke deep bugs. 

2855 data_append(dt, "tags", "multiword-construction") 

2856 new_ret.append(dt) 

2857 ret = new_ret 

2858 

2859 # Always insert "table-tags" detail as the first entry in any inflection 

2860 # table. This way we can reliably detect where a new table starts. 

2861 # Table-tags applies until the next table-tags entry. 

2862 if ret or table_tags: 

2863 table_tags = sorted(set(table_tags)) 

2864 dt = { 

2865 "form": " ".join(table_tags), 

2866 "source": source, 

2867 "tags": ["table-tags"], 

2868 } 

2869 if dt["form"] == "": 

2870 dt["form"] = "no-table-tags" 

2871 if tablecontext.template_name: 

2872 tn: FormData = { 

2873 "form": tablecontext.template_name, 

2874 "source": source, 

2875 "tags": ["inflection-template"], 

2876 } 

2877 ret = [dt] + [tn] + ret 

2878 else: 

2879 ret = [dt] + ret 

2880 

2881 return ret 

2882 

2883 

def handle_generic_table(
    wxr: WiktextractContext,
    tablecontext: "TableContext",
    data: WordData,
    word: str,
    lang: str,
    pos: str,
    rows: list[list[InflCell]],
    titles: list[str],
    source: str,
    after: str,
    depth: int,
) -> None:
    """Parse ``rows`` as a simple inflection table and append the resulting
    form entries to ``data["forms"]``, dropping exact duplicates and
    "dated" variants of forms already present without that tag."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for table_row in rows:
        assert isinstance(table_row, list)
        for cell in table_row:
            assert isinstance(cell, InflCell)
    assert isinstance(titles, list)
    for title in titles:
        assert isinstance(title, str)

    # Try to parse the table as a simple table
    ret = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if ret is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms but eliminate duplicates.
    seen_forms = set()
    for form_dt in ret:
        frozen = freeze(form_dt)
        if frozen in seen_forms:
            continue  # Don't add duplicates
        # Some Russian words have Declension and Pre-reform declension
        # partially duplicating the same data.  Don't add the "dated"-tagged
        # variant if we already have the same entry without "dated" from the
        # modern declension table.
        form_tags = form_dt.get("tags", [])
        shadowed_by_undated = False
        if "dated" in form_tags:
            undated_dt = form_dt.copy()
            stripped_tags = [t for t in form_tags if t != "dated"]
            undated_dt["tags"] = stripped_tags
            if stripped_tags and freeze(undated_dt) in seen_forms:
                # Already have this without the archaic marker; skip it.
                shadowed_by_undated = True
        if not shadowed_by_undated:
            # "table-tags" marker entries are never used for dedup.
            if "table-tags" not in form_tags:
                seen_forms.add(frozen)
            data_append(data, "forms", form_dt)

2950 

def determine_header(
    wxr: WiktextractContext,
    tablecontext,
    lang: str,
    word: str,
    pos: str,
    table_kind: NodeKind,
    kind: NodeKind | str,
    style: str | None,
    row: list[InflCell],
    col: WikiNode,
    celltext: str,
    titletext: str,
    cols_headered: list[bool],
    target: str | None,
    cellstyle: str,
    # is_title,
    # hdr_expansion,
    # target,
    # celltext,
) -> tuple[bool, list[tuple[str, ...]], str | None, str]:
    """Heuristically decide whether a single table cell is a header.

    Returns ``(is_title, hdr_expansion, target, celltext)`` where
    ``is_title`` says the cell should be treated as a header,
    ``hdr_expansion`` is the tag-set expansion of the cleaned cell text,
    ``target`` may be set from a "Header: target" style cell, and
    ``celltext`` may have been truncated (for "Header: target" cells) or
    cleared.  The decision combines the node kind (``th`` vs ``td``),
    expansion success, cell style comparison, and per-language allowlists
    in LANGUAGES_WITH_CELLS_AS_HEADERS.
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # The node kind that marks a "real" header cell in this table flavor:
    # TABLE_HEADER_CELL for wikitext tables, "th" for HTML tables.
    header_kind: NodeKind | str
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    # Position of a "Header: target" separator, if any (used near the end).
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    # Expand the cleaned text through the inflection map to see whether it
    # could plausibly be a header (silent=True: no debug noise here).
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # A non-header cell whose text nevertheless expands cleanly:
        # only accept it as a header if both the language and the exact
        # cleaned text are allowlisted in LANGUAGES_WITH_CELLS_AS_HEADERS.
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    # "Header: target" cells: keep the header part as celltext and pass
    # the remainder back to the caller as ``target``.
    if idx >= 0 and titletext[:idx] in infl_map:
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        # Reject cells too close to the page title itself (they are
        # probably forms, not headers).
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        # Style-based detection also goes through the same per-language
        # allowlist gating as above, with its own debug messages.
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext

3148 

3149 

3150class TableContext: 

3151 """Saved context used when parsing a table and its subtables.""" 

3152 

3153 __slot__ = ( 

3154 "stored_hdrspans", 

3155 "section_header", 

3156 "template_name", 

3157 ) 

3158 

3159 def __init__(self, template_name: str | None = None) -> None: 

3160 self.stored_hdrspans: list[HdrSpan] = [] 

3161 self.section_header: tuple[str, ...] = tuple() 

3162 if template_name is None: 

3163 self.template_name = "" 

3164 else: 

3165 self.template_name = template_name 

3166 

3167 

3168def handle_wikitext_or_html_table( 

3169 wxr: WiktextractContext, 

3170 word: str, 

3171 lang: str, 

3172 pos: str, 

3173 data: WordData, 

3174 tree: WikiNode, 

3175 titles: list[str], 

3176 source: str, 

3177 after: str, 

3178 tablecontext: TableContext | None = None, 

3179): 

3180 """Parses a table from parsed Wikitext format into rows and columns of 

3181 InflCell objects and then calls handle_generic_table() to parse it into 

3182 forms. This adds the forms into ``data``.""" 

3183 assert isinstance(wxr, WiktextractContext) 

3184 assert isinstance(word, str) 

3185 assert isinstance(lang, str) 

3186 assert isinstance(pos, str) 

3187 assert isinstance(data, dict) 

3188 assert isinstance(tree, WikiNode) 

3189 assert tree.kind == NodeKind.TABLE or ( 

3190 tree.kind == NodeKind.HTML and tree.sarg == "table" 

3191 ) 

3192 assert isinstance(titles, list) 

3193 assert isinstance(source, str) 

3194 for x in titles: 

3195 assert isinstance(x, str) 

3196 assert isinstance(after, str) 

3197 assert tablecontext is None or isinstance(tablecontext, TableContext) 

3198 # Imported here to avoid a circular import 

3199 from wiktextract.page import clean_node, recursively_extract 

3200 

3201 # from wikitextprocessor.parser import print_tree 

3202 # print_tree(tree) 

3203 # print("-------==========-------") 

3204 

3205 if not tablecontext: 

3206 tablecontext = TableContext() 

3207 

3208 # Get language specific text removal patterns 

3209 remove_text_patterns: tuple[str | re.Pattern, ...] | None = None 

3210 if rem := get_lang_conf(lang, "remove_text_patterns"): 

3211 for poses in rem.keys(): 

3212 if pos in poses: 

3213 remove_text_patterns = rem[poses] 

3214 break 

3215 

3216 def handle_table1( 

3217 wxr: WiktextractContext, 

3218 tablecontext: TableContext, 

3219 word: str, 

3220 lang: str, 

3221 pos: str, 

3222 data: WordData, 

3223 tree: WikiNode, 

3224 titles: list[str], 

3225 source: str, 

3226 after: str, 

3227 depth: int, 

3228 ) -> list[tuple[list[list[InflCell]], list[str], str, int]]: 

3229 # rows, titles, after, depth 

3230 """Helper function allowing the 'flattening' out of the table 

3231 recursion: instead of handling the tables in the wrong order 

3232 (recursively), this function adds to new_row that is then 

3233 iterated through in the main function at the end, creating 

3234 a longer table (still in pieces) in the correct order.""" 

3235 

3236 assert isinstance(data, dict) 

3237 assert isinstance(titles, list) 

3238 assert isinstance(source, str) 

3239 for x in titles: 

3240 assert isinstance(x, str) 

3241 assert isinstance(after, str) 

3242 assert isinstance(depth, int) 

3243 # print("HANDLE_WIKITEXT_TABLE", titles) 

3244 

3245 # Filling for columns with rowspan > 1 

3246 col_gap_data: list[InflCell | None] = [] 

3247 # Number of remaining rows for which to fill the column 

3248 vertical_still_left: list[int] = [] 

3249 cols_headered: list[bool] = [] # [F, T, F, F...] 

3250 # True when the whole column contains headers, even 

3251 # when the cell is not considered a header; triggered 

3252 # by the "*" inflmap meta-tag. 

3253 rows: list[list[InflCell]] = [] 

3254 

3255 sub_ret = [] 

3256 

3257 # from wikitextprocessor.parser import print_tree 

3258 # print_tree(tree) 

3259 for node in tree.children: 

3260 if not isinstance(node, WikiNode): 

3261 continue 

3262 kind: NodeKind | str 

3263 if node.kind == NodeKind.HTML: 

3264 kind = node.sarg 

3265 else: 

3266 kind = node.kind 

3267 

3268 # print(" {}".format(node)) 

3269 if kind in (NodeKind.TABLE_CAPTION, "caption"): 

3270 # print(" CAPTION:", node) 

3271 pass 

3272 elif kind in (NodeKind.TABLE_ROW, "tr"): 

3273 if "vsShow" in node.attrs.get("class", "").split(): 

3274 # vsShow rows are those that are intially shown in tables 

3275 # that have more data. The hidden data duplicates these 

3276 # rows, so we skip it and just process the hidden data. 

3277 continue 

3278 

3279 # if ( 

3280 # len(node.children) == 1 

3281 # and node.children[0].attrs.get("class") == "separator" 

3282 # ): 

3283 # print("------------------ skip separator") 

3284 # continue 

3285 

3286 # Parse a table row. 

3287 row: list[InflCell] = [] 

3288 style = None 

3289 row_has_nonempty_cells = False 

3290 # Have nonempty cell not from rowspan 

3291 for col in get_table_cells(node): 

3292 # loop through each cell in the ROW 

3293 

3294 # The below skip is not needed anymore, because we "skip" in 

3295 # get_table_cells, but left here as a comment 

3296 # if not isinstance(col, WikiNode): 

3297 # # This skip is not used for counting, 

3298 # # "None" is not used in 

3299 # # indexing or counting or looping. 

3300 # continue 

3301 if col.kind == NodeKind.HTML: 

3302 kind = col.sarg 

3303 else: 

3304 kind = col.kind 

3305 if kind not in ( 3305 ↛ 3311line 3305 didn't jump to line 3311 because the condition on line 3305 was never true

3306 NodeKind.TABLE_HEADER_CELL, 

3307 NodeKind.TABLE_CELL, 

3308 "th", 

3309 "td", 

3310 ): 

3311 print(" UNEXPECTED ROW CONTENT: {}".format(col)) 

3312 continue 

3313 

3314 while ( 

3315 len(row) < len(vertical_still_left) 

3316 and vertical_still_left[len(row)] > 0 

3317 ): 

3318 # vertical_still_left is [...0, 0, 2...] for each 

3319 # column. It is populated at the end of the loop, at the 

3320 # same time as col_gap_data. This needs to be looped and 

3321 # filled this way because each `for col`-looping jumps 

3322 # straight to the next meaningful cell; there is no 

3323 # "None" cells, only emptiness between, and rowspan and 

3324 # colspan are just to generate the "fill- 

3325 vertical_still_left[len(row)] -= 1 

3326 

3327 # KJ Apr 2026 

3328 # type checking is ignored; I am pretty sure that 

3329 # row will never contain None, even if col_gap_data 

3330 # is `InflCell | None`, but this code is such 

3331 # spaghetti that it's hard to figure out, except 

3332 # by the process of elimination: this has never 

3333 # caused trouble before, ergo, it works. 

3334 row.append(col_gap_data[len(row)]) # type: ignore 

3335 

3336 # appending row is how "indexing" is 

3337 # done here; something is appended, 

3338 # like a filler-cell here or a "start" 

3339 # cell at the end of the row-loop, 

3340 # which increased len(row) which is 

3341 # then used as the target-index to check 

3342 # for gaps. vertical_still_left is 

3343 # the countdown to when to stop 

3344 # filling in gaps, and goes down to 0, 

3345 # and col_gap_data is not touched 

3346 # except when a new rowspan is needed, 

3347 # at the same time that 

3348 # vertical_still_left gets reassigned. 

3349 

3350 try: 

3351 rowspan = int(col.attrs.get("rowspan", "1")) # 🡙 

3352 colspan = int(col.attrs.get("colspan", "1")) # 🡘 

3353 except ValueError: 

3354 rowspan = 1 

3355 colspan = 1 

3356 # print("COL:", col) 

3357 

3358 # Too many of these errors 

3359 if colspan > 100: 

3360 # wxr.wtp.error( 

3361 # f"Colspan {colspan} over 30, set to 1", 

3362 # sortid="inflection/20250113a", 

3363 # ) 

3364 colspan = 100 

3365 if rowspan > 100: 3365 ↛ 3370line 3365 didn't jump to line 3370 because the condition on line 3365 was never true

3366 # wxr.wtp.error( 

3367 # f"Rowspan {rowspan} over 30, set to 1", 

3368 # sortid="inflection/20250113b", 

3369 # ) 

3370 rowspan = 100 

3371 

3372 # Process any nested tables recursively. 

3373 tables, rest = recursively_extract( 

3374 col, 

3375 lambda x: ( 

3376 isinstance(x, WikiNode) 

3377 and (x.kind == NodeKind.TABLE or x.sarg == "table") 

3378 ), 

3379 ) 

3380 

3381 # Clean the rest of the cell. 

3382 link_capture_dict: dict = {} 

3383 celltext = clean_node( 

3384 wxr, link_capture_dict, rest, collect_links=True 

3385 ) 

3386 cell_links: list[tuple[str, str]] | None = ( 

3387 link_capture_dict.get("links", None) 

3388 ) 

3389 # print(f"CLEANED: {celltext=}") 

3390 # print(f"SUBTABLES: {tables}") 

3391 # print(f"{link_capture_dict=}") 

3392 

3393 # Remove regexed patterns from text 

3394 if remove_text_patterns is not None: 

3395 for pat in remove_text_patterns: 

3396 celltext = re.sub(pat, "", celltext) 

3397 # print(f"AFTER: {celltext=} <<") 

3398 

3399 # Handle nested tables. 

3400 for tbl in tables: 

3401 # Some nested tables (e.g., croí/Irish) have subtitles 

3402 # as normal paragraphs in the same cell under a descrip- 

3403 # tive text that should be treated as a title (e.g., 

3404 # "Forms with the definite article", with "definite" not 

3405 # mentioned elsewhere). 

3406 new_titles = list(titles) 

3407 if celltext: 

3408 new_titles.append(celltext) 

3409 subtbl = handle_table1( 

3410 wxr, 

3411 tablecontext, 

3412 word, 

3413 lang, 

3414 pos, 

3415 data, 

3416 tbl, # type: ignore 

3417 new_titles, 

3418 source, 

3419 "", 

3420 depth + 1, 

3421 ) 

3422 if subtbl: 3422 ↛ 3400line 3422 didn't jump to line 3400 because the condition on line 3422 was always true

3423 sub_ret.append((rows, titles, after, depth)) 

3424 rows = [] 

3425 titles = [] 

3426 after = "" 

3427 sub_ret.extend(subtbl) 

3428 

3429 # This magic value is used as part of header detection 

3430 cellstyle = ( 

3431 col.attrs.get("style", "") 

3432 + "//" 

3433 + col.attrs.get("class", "") 

3434 + "//" 

3435 + str(kind) 

3436 ) 

3437 

3438 if not row: # if first column in row 

3439 style = cellstyle 

3440 target = None 

3441 titletext = celltext.strip() 

3442 while titletext and is_superscript(titletext[-1]): 

3443 titletext = titletext[:-1] 

3444 

3445 ( 

3446 is_title, 

3447 hdr_expansion, 

3448 target, 

3449 celltext, 

3450 ) = determine_header( 

3451 wxr, 

3452 tablecontext, 

3453 lang, 

3454 word, 

3455 pos, 

3456 tree.kind, 

3457 kind, 

3458 style, 

3459 row, 

3460 col, 

3461 celltext, 

3462 titletext, 

3463 cols_headered, 

3464 None, 

3465 cellstyle, 

3466 ) 

3467 

3468 if is_title: 

3469 # If this cell gets a "*" tag, make the whole column 

3470 # below it (toggling it in cols_headered = [F, F, T...]) 

3471 # into headers. 

3472 while len(cols_headered) <= len(row): 

3473 cols_headered.append(False) 

3474 if any("*" in tt for tt in hdr_expansion): 

3475 cols_headered[len(row)] = True 

3476 celltext = "" 

3477 # if row_has_nonempty_cells has been True at some point, it 

3478 # keeps on being True. 

3479 # if row_has_nonempty_cells or is_title or celltext != "": 

3480 # row_has_nonempty_cells = True 

3481 # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓ 

3482 row_has_nonempty_cells |= is_title or celltext != "" 

3483 cell = InflCell( 

3484 celltext, is_title, colspan, rowspan, target, cell_links 

3485 ) 

3486 for _ in range(0, colspan): 

3487 # colspan🡘 current loop (col) or 1 

3488 # All the data-filling for colspan 

3489 # is done simply in this loop, 

3490 # while rowspan needs to use 

3491 # vertical_still_left to count gaps 

3492 # and col_gap_data to fill in 

3493 # those gaps with InflCell data. 

3494 if rowspan > 1: # rowspan🡙 current loop (col) or 1 

3495 while len(col_gap_data) <= len(row): 

3496 # Initialize col_gap_data/ed if 

3497 # it is lacking slots 

3498 # for each column; col_gap_data and 

3499 # vertical_still_left are never 

3500 # reset to [], during 

3501 # the whole table function. 

3502 col_gap_data.append(None) 

3503 vertical_still_left.append(0) 

3504 # Below is where the "rectangle" block of rowspan 

3505 # and colspan is filled for the future. 

3506 col_gap_data[len(row)] = cell 

3507 # col_gap_data contains cells that 

3508 # will be used in the 

3509 # future, or None 

3510 vertical_still_left[len(row)] = rowspan - 1 

3511 # A counter for how many gaps🡙 are still left to be 

3512 # filled (row.append or 

3513 # row[col_gap_data[len(row)] => 

3514 # rows), it is not reset to [], but decremented to 0 

3515 # each time a row gets something from col_gap_data. 

3516 # Append this cell 1+ times for colspan🡘 

3517 row.append(cell) 

3518 if not row: 

3519 continue 

3520 # After looping the original row-nodes above, fill 

3521 # in the rest of the row if the final cell has colspan 

3522 # (inherited from above, so a cell with rowspan and colspan) 

3523 for i in range(len(row), len(vertical_still_left)): 

3524 if vertical_still_left[i] <= 0: 

3525 continue 

3526 vertical_still_left[i] -= 1 

3527 while len(row) < i: 

3528 row.append(InflCell("", False, 1, 1, None)) 

3529 row.append(col_gap_data[i]) # type: ignore 

3530 # print(" ROW {!r}".format(row)) 

3531 if row_has_nonempty_cells: 3531 ↛ 3259line 3531 didn't jump to line 3259 because the condition on line 3531 was always true

3532 rows.append(row) 

3533 elif kind in ( 3533 ↛ 3259line 3533 didn't jump to line 3259 because the condition on line 3533 was always true

3534 NodeKind.TABLE_HEADER_CELL, 

3535 NodeKind.TABLE_CELL, 

3536 "th", 

3537 "td", 

3538 "span", 

3539 ): 

3540 # print(" TOP-LEVEL CELL", node) 

3541 pass 

3542 

3543 if sub_ret: 

3544 main_ret = sub_ret 

3545 main_ret.append((rows, titles, after, depth)) 

3546 else: 

3547 main_ret = [(rows, titles, after, depth)] 

3548 return main_ret 

3549 

3550 new_rows = handle_table1( 

3551 wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0 

3552 ) 

3553 

3554 # Now we have a table that has been parsed into rows and columns of 

3555 # InflCell objects. Parse the inflection table from that format. 

3556 if new_rows: 3556 ↛ exitline 3556 didn't return from function 'handle_wikitext_or_html_table' because the condition on line 3556 was always true

3557 for rows, titles, after, depth in new_rows: 

3558 handle_generic_table( 

3559 wxr, 

3560 tablecontext, 

3561 data, 

3562 word, 

3563 lang, 

3564 pos, 

3565 rows, 

3566 titles, 

3567 source, 

3568 after, 

3569 depth, 

3570 ) 

3571 

3572 

def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """Yield the direct cell children of `node`, hoisting nested HTML cells.

    Wikitext table cells sometimes contain raw `<th>`/`<td>` HTML elements
    (because it is easier to write wikitext conditionals that way), and
    those end up parsed as child elements of the wikitext cell.  This
    generator yields each direct WikiNode child of `node`; when such a
    child has direct th/td HTML children, those are removed from the child
    and yielded separately right after it."""

    def is_html_cell(child) -> bool:
        # True for a direct <th> or <td> HTML element.
        return isinstance(child, HTMLNode) and child.sarg in ("th", "td")

    for cell in node.children:
        if not isinstance(cell, WikiNode):
            continue
        nested = [c for c in cell.children if is_html_cell(c)]
        if not nested:
            yield cell
            continue
        # Strip the th/td elements out of the cell so they are not
        # returned twice, then yield the cell followed by each of them.
        cell.children = [c for c in cell.children if not is_html_cell(c)]
        yield cell
        yield from nested

3601 

3602 

def handle_html_table(
    wxr: WiktextractContext,
    word: str,
    lang: str,
    pos: str,
    data: WordData,
    tree: WikiNode,
    titles: list[str],
    source: str,
    after: str,
    tablecontext: TableContext | None = None,
) -> None:
    """Forward an HTML table to the shared wikitext/HTML table handler.

    Thin passer-on wrapper kept for symmetry with handle_wikitext_table;
    XXX, remove these?"""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )

3619 

3620 

def handle_wikitext_table(
    wxr: WiktextractContext,
    word: str,
    lang: str,
    pos: str,
    data: WordData,
    tree: WikiNode,
    titles: list[str],
    source: str,
    after: str,
    tablecontext: TableContext | None = None,
) -> None:
    """Forward a wikitext table to the shared wikitext/HTML table handler.

    Thin passer-on wrapper kept for symmetry with handle_html_table;
    XXX, remove these?"""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )

3637 

3638 

def parse_inflection_section(
    wxr: WiktextractContext,
    data: WordData,
    word: str,
    lang: str,
    pos: str,
    section: str,
    tree: WikiNode,
    tablecontext: TableContext | None = None,
) -> None:
    """Parses an inflection section on a page.  ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    Walks ``tree`` depth-first, collecting inflection tables (both
    wikitext tables and HTML <table> elements) together with title text
    gathered on the way down (NavFrame headers, preceding ";"-list
    headings), then dispatches each collected table to the table
    handlers."""

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    # .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Collected tables: (kind, table node, titles seen on the way down,
    # raw string fragments following the table — its "after" text).
    tables: list[
        tuple[Literal["html", "wikitext"], WikiNode, list[str], list[str]]
    ] = []
    # Accumulates NavHead title fragments while inside a NavFrame.
    titleparts: list[str] = []
    preceding_bolded_title = ""

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("--------------******************----------------")

    def process_tables() -> None:
        # Dispatch every collected table to the matching handler, with
        # its "after" text joined and cleaned.
        for kind, node, titles, after_l in tables:
            after = "".join(after_l).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node: WikiNode | str, titles: list[str]) -> None:
        # Process a NavFrame div in isolation: tables found inside it are
        # collected into a fresh list and dispatched immediately, so that
        # NavFrame-local titles do not leak into sibling tables.
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        # NOTE(review): the incoming ``titles`` argument is not passed on
        # here; recursion restarts with an empty title list — TODO confirm
        # this is intentional.
        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(
        node: WikiNode
        | str
        | list[WikiNode | str]
        | list[list[WikiNode | str]],
        titles: list[str],
        navframe=False,
    ) -> None:
        # Depth-first walk collecting tables and their titles.  When
        # ``navframe`` is True we are inside a NavFrame div and plain
        # strings contribute to the frame's title instead of "after" text.
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Strings after an already-collected table become its "after"
            # text; inside a NavFrame they accumulate as title fragments.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    # NavHead contents become title fragments (via the
                    # string branch above).
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    # The NavHead fragments gathered so far form the title
                    # for this content, unless they are a Note(s) line.
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    # Re-walk the same node with navframe off so the
                    # normal table-collection branches apply.
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(("wikitext", node, titles, []))
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                htmlclasses = node.attrs.get("class", ())
                if "audiotable" in htmlclasses:
                    # Pronunciation audio tables are not inflection tables.
                    return
                tables.append(("html", node, titles, []))
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
        if (
            kind == NodeKind.HTML
            and node.sarg == "div"
            and "NavFrame" in node.attrs.get("class", "").split()
        ):
            recurse_navframe(node, titles)
            return
        if kind == NodeKind.LINK:
            # Descend into the link's display text if present, else its
            # target.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.HTML and node.sarg == "ref":
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A ";"-list item renders as a bolded heading; remember it as
            # the title for tables that follow at the top level.
            nonlocal preceding_bolded_title
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")