Coverage for src/wiktextract/extractor/en/inflection.py: 87%

1549 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 06:55 +0000

1# Code for parsing inflection tables. 

2# 

3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org. 

4 

5import collections 

6import copy 

7import functools 

8import html 

9import re 

10import unicodedata 

11from typing import TYPE_CHECKING, Generator, Literal, Optional, Union 

12 

13from mediawiki_langcodes import code_to_name, name_to_code 

14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, freeze, split_at_comma_semi 

18from ...tags import valid_tags 

19from ...wxr_context import WiktextractContext 

20from .form_descriptions import ( 

21 classify_desc, 

22 decode_tags, 

23 distw, 

24 match_links_to_form, 

25 parse_head_final_tags, 

26) 

27from .inflection_kludges import ka_decl_noun_template_cell 

28from .inflectiondata import infl_map, infl_start_map, infl_start_re 

29from .lang_specific_configs import get_lang_conf, lang_specific_tags 

30from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS 

31from .type_utils import FormData, WordData 

32 

# --debug-text-cell WORD
# Command-line parameter for debugging.  While inflection tables are being
# parsed, debug messages are printed when this text is encountered.
debug_cell_text: Optional[str] = None


def set_debug_cell_text(text: str) -> None:
    """Store ``text`` in the module-level ``debug_cell_text`` variable."""
    global debug_cell_text
    debug_cell_text = text

42 

43 

# A list of alternative tag sets; each alternative is a tuple of tag strings.
TagSets = list[tuple[str, ...]]

# Column texts that are interpreted as an empty column.
IGNORED_COLVALUES = {
    # Dash-like placeholders
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    # NOTE(review): this entry looks identical to the first one here; it may
    # originally have been a different dash code point -- confirm.
    "-",
    # Other placeholders
    "/",
    "?",
    "not used",
    "not applicable",
}

68 

# These tags are never inherited from above
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-" + suffix
    for suffix in ("i", "i-long", "ii", "iii", "iv", "v")
}

79 

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags.  Every value is
# simply the key prefixed with "object-".
object_concord_replacements = {
    tag: "object-" + tag
    for tag in (
        "first-person",
        "second-person",
        "third-person",
        "singular",
        "plural",
        "definite",
        "indefinite",
        # class-1 ... class-18
        *("class-{}".format(n) for n in range(1, 19)),
        "masculine",
        "feminine",
    )
}

111 

# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: report every entry whose value contains a tag
# that is not present in valid_tags.
for k, v in title_contains_global_map.items():
    if not all(t in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

# "Inflection of x"-style fragment; it is part of the title regexes below so
# that it gets matched, and parse_title() then skips such matches.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(map(re.escape, title_contains_global_map)),
    )
)

152 

# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
    "contracted": "contracted",
    "present": "present",
    "perfect": "perfect",
    "imperfect": "imperfect",
    "pluperfect": "pluperfect",
    "future": "future",
    "aorist": "aorist",
    "eastern armenian": "Eastern-Armenian",
    "western armenian": "Western-Armenian",
    "-al conjugation": "-al-conjugation",
    "-al negative conjugation": "-al-conjugation",
    "-il conjugation": "-il-conjugation",
    "-il negative conjugation": "-il-conjugation",
    "-el conjugation": "-el-conjugation",
    "-el negative conjugation": "-el-conjugation",
    "-ul conjugation": "-ul-conjugation",
    "-ul negative conjugation": "-ul-conjugation",
    "u-type": "u-type",
    "nominalized infinitive": "noun infinitive",
}
# Import-time sanity check: report every entry whose value contains a tag
# that is not present in valid_tags.
for k, v in title_contains_wordtags_map.items():
    if not all(t in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
# The keys go into the alternation longest first, so that a longer key is
# tried before any shorter key (e.g. "imperfective" before "perfective").
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(
            map(
                re.escape,
                sorted(title_contains_wordtags_map.keys(), key=len)[::-1],
            )
        ),
    )
)

251 

# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "iterative": "iterative",
    "poetic": "poetic",
    "Attic": "Attic",
    "Epic": "Epic",
    "Aeolic": "Aeolic",
    "Arcadocypriot": "Arcadocypriot",
    "Old Attic": "Old-Attic",
    "Boeotian": "Boeotian",
    "Byzantine": "Byzantine",
    "Choral Doric": "Choral-Doric",
    "Doric": "Doric",
    "Elean": "Elean",
    "Epirote": "Epirote",
    "Ionic": "Ionic",
    "Koine": "Koine",
    "Cretan": "Cretan",
    "Corinthian": "Corinthian",
    "Laconian": "Laconian",
    "Later poetic": "Later-poetic-Ancient-Greek",
    "Lesbian": "Lesbian",
    "Locrian": "Locrian",
    "Lyric": "Lyric-Ancient-Greek",
    "Thessalian": "Thessalian",
    "Tragic": "Tragic-Ancient-Greek",
}
# Import-time sanity check: report every entry whose value contains a tag
# that is not present in valid_tags.
for k, v in title_elements_map.items():
    if not all(t in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

293 

# Starts of parenthesized title elements, mapped to the tags given to the
# form made from the rest of the element
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: report every entry whose value contains a tag
# that is not present in valid_tags.
for k, v in title_elemstart_map.items():
    if not all(t in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Matches one of the keys at the start of an element, followed by a space.
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(map(re.escape, title_elemstart_map)))
)

314 

315 

# Regexp for cell starts that are likely definitions of reference symbols.
# See also nondef_re.
# Capture groups: 1 = optional leading whitespace/bullet, 2 = the whole
# marker alternative, 3 = "*"s / symbols / digits, 4 = what follows group 3,
# 5 = symbol after "^", 6 = single superscript digit, 7 = modifier letter.
def_re = re.compile(
    r"(\s*•?\s+)?"
    "("
    + "|".join(
        (
            r"(\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))",
            r"\^(\*+|[△†])",
            r"([¹²³⁴⁵⁶⁷⁸⁹])",
            r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ])",
        )
    )
    + ")"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    "("
    + "|".join(
        (
            r"^\s*(1|2|3)\s+(sg|pl)\s*$",  # 1s or 3p etc.
            r"\s*\d\d?\s*/\s*\d\d?\s*$",  # taka/Swahili "15 / 17"
        )
    )
    + ")"
)

333 

334 

335class InflCell: 

336 """Cell in an inflection table.""" 

337 

338 __slots__ = ( 

339 "text", 

340 "is_title", 

341 "colspan", 

342 "rowspan", 

343 "target", 

344 "links", 

345 ) 

346 

347 def __init__( 

348 self, 

349 text: str, 

350 is_title: bool, 

351 colspan: int, 

352 rowspan: int, 

353 target: str | None, 

354 cell_links: list[tuple[str, str]] | None = None, 

355 ) -> None: 

356 assert isinstance(text, str) 

357 assert is_title in (True, False) 

358 assert isinstance(colspan, int) and colspan >= 1 

359 assert isinstance(rowspan, int) and rowspan >= 1 

360 assert target is None or isinstance(target, str) 

361 self.text = text.strip() 

362 self.is_title = text and is_title 

363 self.colspan = colspan 

364 self.rowspan = rowspan 

365 self.target = target 

366 self.links = cell_links 

367 

368 def __str__(self) -> str: 

369 v = "{}/{}/{}/{!r}".format( 

370 self.text, self.is_title, self.colspan, self.rowspan 

371 ) 

372 if self.target: 

373 v += ": {!r}".format(self.target) 

374 return v 

375 

376 def __repr__(self) -> str: 

377 return str(self) 

378 

379 

class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table.

    ``tagsets`` is stored normalized: within each alternative duplicate
    tags are dropped and the tags are sorted.  ``expanded`` starts out
    ``False``.
    """

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: list[tuple[str, ...]],
        text: str,
        all_headers_row: bool,
    ) -> None:
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        assert all(isinstance(alt, tuple) for alt in tagsets)
        assert all_headers_row in (True, False)
        self.start, self.colspan = start, colspan
        self.rowspan, self.rownum = rowspan, rownum
        # Deduplicate and sort the tags of every alternative.
        self.tagsets = [tuple(sorted(set(alt))) for alt in tagsets]
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False

420 

421 

def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character.

    The decision is based on the Unicode name of ``ch``: it must start with
    "SUPERSCRIPT ", "MODIFIER LETTER SMALL " or "MODIFIER LETTER CAPITAL ".
    Characters that have no Unicode name yield False.
    """
    assert isinstance(ch, str) and len(ch) == 1
    try:
        name = unicodedata.name(ch)
    except ValueError:
        # unicodedata.name() raises ValueError for unnamed characters
        return False
    return name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )

438 

439 

def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no purpose
    together (cover all options).

    ``tags`` is modified in place and nothing is returned.  ``pos`` is only
    type-checked here.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Opposite pairs that are dropped together when the language
    # configuration enables it.
    for first, second, conf_key in (
        ("animate", "inanimate", "animate_inanimate_remove"),
        ("virile", "nonvirile", "virile_nonvirile_remove"),
    ):
        if first in tags and second in tags and get_lang_conf(lang, conf_key):
            tags.remove(first)
            tags.remove(second)
    # For each configured category (numbers, genders, voices, strengths,
    # persons, definitenesses): if every value configured for the language
    # is present, the tags carry no information, so remove them all.
    for conf_key in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        all_values = get_lang_conf(lang, conf_key)
        if all_values and all(x in tags for x in all_values):
            for x in all_values:
                tags.remove(x)

490 

491 

def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns the set of ``valid_tags`` values (tag categories) for all
    tags in the tagset, merged over all alternatives."""
    return {valid_tags[tag] for alternative in tagset for tag in alternative}

496 

497 

def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations).  If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives.  The tagsets are assumed be sets of sorted tuples.

    Empty alternatives are ignored; if nothing remains, the result is
    ``[()]``.  ``lang`` and ``pos`` are passed on to remove_useless_tags().
    """
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # Validate the elements of the *second* argument (the original re-checked
    # tagsets1 here, leaving tagsets2 unvalidated).
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        """Add ``tags1`` to the result, merging it with an existing
        alternative when they differ in at most one category."""
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            if num_differ <= 1:
                # Yes, they can be merged.  Mutating the list while
                # iterating is safe only because we return right after.
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        tagsets.append(())

    return tagsets

560 

561 

def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible.

    Each combined set is passed through remove_useless_tags(), has the
    "dummy-ignored-text-cell" tag dropped, and is returned as a sorted
    tuple.  Duplicate results are returned only once, in first-seen order.
    Both arguments must be non-empty lists of tuples.
    """
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # Validate the elements of the *second* argument (the original re-checked
    # tagsets1 here, leaving tagsets2 unvalidated).
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets: list[tuple[str, ...]] = []
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            merged = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, merged)
            merged.discard("dummy-ignored-text-cell")
            combined = tuple(sorted(merged))
            if combined not in new_tagsets:
                new_tagsets.append(combined)
    return new_tagsets

588 

589 

@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing.  This returns
    (cleaned, refs, defs, tags).

    - ``cleaned``: the cell text without reference markers; the literal
      "dummy-ignored-text-cell" for explanatory note texts; "" when the
      cell is a reference definition.
    - ``refs``: NFKD-normalized reference markers found at the end of the
      text.
    - ``defs``: ``(ref, definition text)`` pairs when the cell defines
      reference symbols.
    - ``tags``: tags from markers ("rare", or the language's
      "special_references" configuration).

    ``word`` is not used in the body.  Results are cached by lru_cache, so
    the returned lists are shared between calls with the same arguments.
    """
    hdr_tags = []
    # Drop a trailing comma or bullet, then normalize whitespace.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col).strip()
    # Explanatory notes are not forms or headers.
    # NOTE(review): r"\^* Note:" matches zero or more carets followed by
    # " Note:"; possibly r"\^\* Note:" was intended -- confirm.
    is_note_text = re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    )
    if is_note_text:
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    paren_m = re.search(r"\s+\([^)]*\)$", col)
    if paren_m is not None:
        final_paren = paren_m.group(0)
        col = col[: paren_m.start()]

    # Extract references and tag markers written as "^x" or "^(x,y)".
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while (caret_m := re.search(r"\^(.|\([^)]*\))$", col)) is not None:
        marker = caret_m.group(1)
        if marker.startswith("(") and marker.endswith(")"):
            marker = marker[1:-1]
        for piece in marker.split(","):
            if piece == "rare":
                hdr_tags.append("rare")
            elif special_references and piece in special_references:
                hdr_tags.extend(special_references[piece].split())
            else:
                if piece.startswith("(") and piece.endswith(")"):
                    piece = piece[1:-1]
                refs.append(unicodedata.normalize("NFKD", piece))
        col = col[: caret_m.start()]

    # See if it is a ref definition
    if def_re.match(col) and not nondef_re.match(col):
        ofs = 0
        ref = None
        deflst = []
        for def_m in def_re.finditer(col):
            if ref:
                # Text between the previous marker and this one defines
                # the previous marker.
                deflst.append((ref, col[ofs : def_m.start()].strip()))
            # NOTE(review): group 7 of def_re (modifier letters) is not
            # consulted here, so such markers give an empty ref -- confirm
            # whether that is intended.
            ref = unicodedata.normalize(
                "NFKD", def_m.group(3) or def_m.group(5) or def_m.group(6) or ""
            )
            ofs = def_m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        return "", [], deflst, []

    # See if it *looks* like a reference to a definition: peel superscript
    # characters (and daggers) off the end one at a time.
    while col:
        last = col[-1]
        if not (is_superscript(last) or last in ("†",)):
            break
        if col.endswith("ʳᵃʳᵉ"):
            hdr_tags.append("rare")
            col = col[:-4].strip()
            continue
        if special_references:
            hit = next(
                (key for key in special_references if col.endswith(key)),
                None,
            )
            if hit is not None:
                hdr_tags.extend(special_references[hit].split())
                col = col[: -len(hit)].strip()
                continue
        # Numbers and H/L/N are useful information
        refs.append(unicodedata.normalize("NFKD", last))
        col = col[:-1]

    # Check for another form of note definition
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols.  Sometimes there are multiple.
    stars_m = re.search(r"\*+$", col)
    if stars_m is not None:
        col = col[: stars_m.start()]
        refs.append(unicodedata.normalize("NFKD", stars_m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    return col.strip(), refs, [], hdr_tags

722 

723 

@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title.  This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is a list of dictionaries describing additional
    forms to be included in the part-of-speech entry.

    Results are cached by lru_cache, so the returned lists are shared
    between calls with the same arguments.
    """
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Decode HTML entities, strip markup and normalize whitespace.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    global_tags: list[str] = []
    table_tags: list[str] = []
    extra_forms: list[FormData] = []

    # Add certain global tags, and then certain table-tags, based on words
    # contained in the title.  Matches of the "Inflection of x" part are
    # skipped.
    for title_re, tag_map, dest in (
        (title_contains_global_re, title_contains_global_map, global_tags),
        (title_contains_wordtags_re, title_contains_wordtags_map, table_tags),
    ):
        for m in title_re.finditer(title):
            found = m.group(0).lower()
            if table_hdr_ign_part_re.match(found):
                continue
            dest.extend(tag_map[found].split())

    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")

    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        extra_forms.append(
            {"form": m.group(1), "source": source, "tags": ["class"]}
        )

    # Parse parenthesized parts from title; each one is a comma-separated
    # list of elements.
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
                continue
            start_m = title_elemstart_re.match(elem)
            if start_m:
                # The matched start selects the tags; the rest is the form.
                extra_forms.append(
                    {
                        "form": elem[start_m.end() :],
                        "source": source,
                        "tags": title_elemstart_map[start_m.group(1)].split(),
                    }
                )

    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        pt_m = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if pt_m is not None:
            extra_forms.append(
                {"form": pt_m.group(2), "tags": ["class"], "source": source}
            )
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                extra_forms.append(
                    {"form": elem, "tags": ["class"], "source": source}
                )
    return global_tags, table_tags, extra_forms

809 

810 

811def expand_header( 

812 wxr: WiktextractContext, 

813 tablecontext: "TableContext", 

814 word: str, 

815 lang: str, 

816 pos: str, 

817 text: str, 

818 base_tags: Union[list[str], set[str], tuple[str, ...]], 

819 silent=False, 

820 ignore_tags=False, 

821 depth=0, 

822 column_number: int | None = None, 

823) -> list[tuple[str, ...]]: 

824 """Expands a cell header to tagset, handling conditional expressions 

825 in infl_map. This returns list of tuples of tags, each list element 

826 describing an alternative interpretation. ``base_tags`` is combined 

827 column and row tags for the cell in which the text is being interpreted 

828 (conditional expressions in inflection data may depend on it). 

829 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags`` 

830 is True, then tags listed in "if" will be ignored in the test (this is 

831 used when trying to heuristically detect whether a non-<th> cell is anyway 

832 a header).""" 

833 assert isinstance(wxr, WiktextractContext) 

834 assert isinstance(word, str) 

835 assert isinstance(lang, str) 

836 assert isinstance(pos, str) 

837 assert isinstance(text, str) 

838 assert isinstance(base_tags, (list, tuple, set)) 

839 assert silent in (True, False) 

840 assert isinstance(depth, int) 

841 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags)) 

842 # First map the text using the inflection map 

843 text = clean_value(wxr, text) 

844 combined_return: list[tuple[str, ...]] = [] 

845 parts = split_at_comma_semi(text, separators=[";"]) 

846 for text in parts: 

847 if not text: 847 ↛ 848line 847 didn't jump to line 848 because the condition on line 847 was never true

848 continue 

849 if text in infl_map: 

850 v = infl_map[text] # list or string 

851 else: 

852 m = re.match(infl_start_re, text) 

853 if m is not None: 853 ↛ 854line 853 didn't jump to line 854 because the condition on line 853 was never true

854 v = infl_start_map[m.group(1)] 

855 # print("INFL_START {} -> {}".format(text, v)) 

856 elif re.match(r"Notes", text): 

857 # Ignored header 

858 # print("IGNORING NOTES") 

859 combined_return = or_tagsets( 

860 lang, pos, combined_return, [("dummy-skip-this",)] 

861 ) 

862 # this just adds dummy-skip-this 

863 continue 

864 elif text in IGNORED_COLVALUES: 

865 combined_return = or_tagsets( 

866 lang, pos, combined_return, [("dummy-ignore-skipped",)] 

867 ) 

868 continue 

869 # Try without final parenthesized part 

870 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text) 

871 if text_without_parens in infl_map: 

872 v = infl_map[text_without_parens] 

873 elif m is None: 873 ↛ 889line 873 didn't jump to line 889 because the condition on line 873 was always true

874 if not silent: 

875 wxr.wtp.debug( 

876 "inflection table: unrecognized header: {}".format( 

877 repr(text) 

878 ), 

879 sortid="inflection/735", 

880 ) 

881 # Unrecognized header 

882 combined_return = or_tagsets( 

883 lang, pos, combined_return, [("error-unrecognized-form",)] 

884 ) 

885 continue 

886 

887 # Then loop interpreting the value, until the value is a simple string. 

888 # This may evaluate nested conditional expressions. 

889 default_else = None 

890 while True: 

891 # If it is a string, we are done. 

892 if isinstance(v, str): 

893 tags = set(v.split()) 

894 remove_useless_tags(lang, pos, tags) 

895 tagset = [tuple(sorted(tags))] 

896 break 

897 # For a list, just interpret it as alternatives. (Currently the 

898 # alternatives must directly be strings.) 

899 if isinstance(v, (list, tuple)): 

900 tagset = [] 

901 for x in v: 

902 tags = set(x.split()) 

903 remove_useless_tags(lang, pos, tags) 

904 tags_t = tuple(sorted(tags)) 

905 if tags_t not in tagset: 905 ↛ 901line 905 didn't jump to line 901 because the condition on line 905 was always true

906 tagset.append(tags_t) 

907 break 

908 # Otherwise the value should be a dictionary describing a 

909 # conditional expression. 

910 if not isinstance(v, dict): 910 ↛ 911line 910 didn't jump to line 911 because the condition on line 910 was never true

911 wxr.wtp.debug( 

912 "inflection table: internal: " 

913 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]), 

914 sortid="inflection/767", 

915 ) 

916 tagset = [()] 

917 break 

918 # Evaluate the conditional expression. 

919 assert isinstance(v, dict) 

920 cond: Union[bool, str] = "default-true" 

921 c: Union[str, list[str], set[str]] = "" 

922 # Handle "lang" condition. The value must be either a 

923 # single language or a list of languages, and the 

924 # condition evaluates to True if the table is one of 

925 # those languages. 

926 if "lang" in v: 

927 c = v["lang"] 

928 # check if it's a code and transform if necessary 

929 if isinstance(c, str): 

930 if c != lang: 

931 cond = lang == code_to_name(c, "en") 

932 else: 

933 cond = True 

934 else: 

935 assert isinstance(c, (list, tuple, set)) 

936 if lang not in c: 

937 cond = name_to_code(lang, "en") in c 

938 else: 

939 cond = True 

940 # Handle "nested-table-depth" condition. The value must 

941 # be an int or list of ints, and the condition evaluates 

942 # True if the depth is one of those values. 

943 # "depth" is how deep into a nested table tree the current 

944 # table lies. It is first started in handle_wikitext_table, 

945 # so only applies to tables-within-tables, not other 

946 # WikiNode content. `depth` is currently only passed as a 

947 # parameter down the table parsing stack, and not stored. 

948 if cond and "nested-table-depth" in v: 948 ↛ 949line 948 didn't jump to line 949 because the condition on line 948 was never true

949 d = v["nested-table-depth"] 

950 if isinstance(d, int): 

951 cond = d == depth 

952 else: 

953 assert isinstance(d, (list, tuple, set)) 

954 cond = depth in d 

955 # Column index: check if we're in position X of the row 

956 if cond and "column-index" in v: 

957 index = v["column-index"] 

958 if isinstance(index, int): 958 ↛ 961line 958 didn't jump to line 961 because the condition on line 958 was always true

959 cond = index == column_number 

960 else: 

961 assert isinstance(index, (list, tuple, set)) 

962 cond = column_number in index 

963 # Handle inflection-template condition. Must be a string 

964 # or list of strings, and if tablecontext.template_name is in 

965 # those, accept the condition. 

966 # TableContext.template_name is passed down from page/ 

967 # parse_inflection, before parsing and expanding itself 

968 # has begun. 

969 if cond and tablecontext and "inflection-template" in v: 

970 d1 = v["inflection-template"] 

971 if isinstance(d1, str): 971 ↛ 974line 971 didn't jump to line 974 because the condition on line 971 was always true

972 cond = d1 == tablecontext.template_name 

973 else: 

974 assert isinstance(d1, (list, tuple, set)) 

975 cond = tablecontext.template_name in d1 

976 # Handle "pos" condition. The value must be either a single 

977 # part-of-speech or a list of them, and the condition evaluates to 

978 # True if the part-of-speech is any of those listed. 

979 if cond and "pos" in v: 

980 c = v["pos"] 

981 if isinstance(c, str): 

982 cond = c == pos 

983 else: 

984 assert isinstance(c, (list, tuple, set)) 

985 cond = pos in c 

986 # Handle "if" condition. The value must be a string containing a 

987 # space-separated list of tags. The condition evaluates to True if 

988 # ``base_tags`` contains all of the listed tags. If the condition 

989 # is of the form "any: ...tags...", then any of the tags will be 

990 # enough. 

991 if cond and "if" in v and not ignore_tags: 

992 c = v["if"] 

993 assert isinstance(c, str) 

994 # "if" condition is true if any of the listed tags is present if 

995 # it starts with "any:", otherwise all must be present 

996 if c.startswith("any: "): 

997 cond = any(t in base_tags for t in c[5:].split()) 

998 else: 

999 cond = all(t in base_tags for t in c.split()) 

1000 

1001 # Handle "default" assignment. Store the value to be used 

1002 # as a default later. 

1003 if "default" in v: 

1004 assert isinstance(v["default"], str) 

1005 default_else = v["default"] 

1006 

1007 # Warning message about missing conditions for debugging. 

1008 

1009 if cond == "default-true" and not default_else and not silent: 

1010 wxr.wtp.debug( 

1011 "inflection table: IF MISSING COND: word={} " 

1012 "lang={} text={} base_tags={} c={} cond={}".format( 

1013 word, lang, text, base_tags, c, cond 

1014 ), 

1015 sortid="inflection/851", 

1016 ) 

1017 # Based on the result of evaluating the condition, select either 

1018 # "then" part or "else" part. 

1019 if cond: 

1020 v = v.get("then", "") 

1021 else: 

1022 v1 = v.get("else") 

1023 if v1 is None: 

1024 if default_else is not None: 

1025 v = default_else 

1026 else: 

1027 if not silent: 

1028 wxr.wtp.debug( 

1029 "inflection table: IF WITHOUT ELSE EVALS " 

1030 "False: " 

1031 "{}/{} {!r} base_tags={}".format( 

1032 word, lang, text, base_tags 

1033 ), 

1034 sortid="inflection/865", 

1035 ) 

1036 v = "error-unrecognized-form" 

1037 else: 

1038 v = v1 

1039 

1040 # Merge the resulting tagset from this header part with the other 

1041 # tagsets from the whole header 

1042 combined_return = or_tagsets(lang, pos, combined_return, tagset) 

1043 

1044 # Return the combined tagsets, or empty tagset if we got no tagsets 

1045 if not combined_return: 

1046 combined_return = [()] 

1047 return combined_return 

1048 

1049 

def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list[HdrSpan],
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Compute the column tagsets for a cell from the current header spans.

    The header spans are walked in reverse list order (the code treats this
    as "headers closer to the cell first").  Headers that do not overlap the
    cell horizontally are ignored; a header that only partially overlaps the
    cell ends the walk.  Tagsets collected from one header row are first
    accumulated in ``row_tagsets`` and folded into the result with
    ``and_tagsets`` whenever the walk moves on to a different row.  A set of
    category-based rules (non-finite/voice/mood/tense/...) decides whether a
    header is merged, skipped, or stops the walk; several of them are
    controlled by per-language settings read through ``get_lang_conf``.

    Args:
        lang: Language of the table; passed to ``get_lang_conf`` and to the
            tagset-combining helpers.
        pos: Part of speech; passed to the tagset-combining helpers.
        hdrspans: Header spans collected so far for the table.
        start: First column covered by the cell (>= 0).
        colspan: Number of columns covered by the cell (>= 1).
        celltext: Text of the cell.  Only used to decide whether debug
            output is printed (when it equals ``debug_cell_text``).

    Returns:
        A list of tag tuples describing the alternative column tagsets.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # Debug prints are emitted only for the cell selected for debugging.
    # The comparison does not change during the call, so evaluate it once.
    debug = celltext == debug_cell_text
    # One past the last column covered by the current cell.
    end = start + colspan
    if debug:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                " row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    # (start, colspan) keys of header positions that have already been used.
    used = set()
    coltags: list[tuple[str, ...]] = [()]
    # Sentinel larger than any real row number: no header accepted yet.
    last_header_row = 1000000
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets: list[tuple[str, ...]] = [()]
    row_tagsets_rownum = 1000000
    # id()s of the HdrSpan objects already consumed for this cell.
    used_hdrspans = set()
    for hdrspan in reversed(hdrspans):
        hdr_end = hdrspan.start + hdrspan.colspan
        if hdr_end <= start or hdrspan.start >= end:
            # Does not horizontally overlap current cell.  Ignore this hdrspan.
            if debug:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if hdrspan.start < start and hdr_end > start and hdr_end < end:
            if debug:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < end
            and hdrspan.start > start
            and hdr_end > end
            and not hdrspan.expanded
        ):
            if debug:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (hdrspan.start > start or hdr_end < end):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = {
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= end
                for tt in x.tagsets
                for t in tt
            }
            if debug:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets.  This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > end:
                    if debug:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if debug:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if debug:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - {
                "gender",
                "number",
                "person",
                "case",
                "category",
                "voice",
            }:
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row.  Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = {
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (x.start < start or x.start + x.colspan > end)
                    for tt in x.tagsets
                    for t in tt
                }
                if debug:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside
                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if debug and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if debug:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above.  However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum starts as the 1000000 sentinel, so this
            # branch is always taken for the first accepted header.
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if debug:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately precedes the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if debug:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if debug:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            # First chain: rules that may stop the walk.  When a rule
            # matches but its language setting says not to stop, control
            # falls through to the second chain below.
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if debug:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if debug:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if debug:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & {"person", "number"}:
                if debug:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if debug:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and "mood" in new_cats:
                if debug:
                    print("stopping on non-finite cur")
                break
            # Second chain: decide between skipping this header, stopping,
            # or merging its tags into the per-row accumulator.
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if debug:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if debug:
                        print("skipping on mood-mood")
                    # No merge; fall through to the last_header_row update
                    # and go on to the next header.
                else:
                    if debug:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if debug:
                        print("skipping on tense-tense")
                    # No merge; fall through to the last_header_row update
                    # and go on to the next header.
                else:
                    if debug:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if debug:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if debug:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if debug:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if debug:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if debug:
                    # Report the accumulator that was just updated.
                    print("merged: {}".format(row_tagsets))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if debug:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags

1430 

1431 

1432def parse_simple_table( 

1433 wxr: WiktextractContext, 

1434 tablecontext: "TableContext", 

1435 word: str, 

1436 lang: str, 

1437 pos: str, 

1438 rows: list[list[InflCell]], 

1439 titles: list[str], 

1440 source: str, 

1441 after: str, 

1442 depth: int, 

1443) -> list[FormData]: 

1444 """This is the default table parser. Despite its name, it can parse 

1445 complex tables. This returns a list of forms to be added to the 

1446 part-of-speech, or None if the table could not be parsed.""" 

1447 assert isinstance(wxr, WiktextractContext) 

1448 assert isinstance(tablecontext, TableContext) 

1449 assert isinstance(word, str) 

1450 assert isinstance(lang, str) 

1451 assert isinstance(pos, str) 

1452 assert isinstance(rows, list) 

1453 assert isinstance(source, str) 

1454 assert isinstance(after, str) 

1455 assert isinstance(depth, int) 

1456 for row in rows: 

1457 for cell in row: 

1458 assert isinstance(cell, InflCell) 

1459 assert isinstance(titles, list) 

1460 for x in titles: 

1461 assert isinstance(x, str) 

1462 

1463 # print("PARSE_SIMPLE_TABLE: TITLES:", titles) 

1464 if debug_cell_text: 1464 ↛ 1465line 1464 didn't jump to line 1465 because the condition on line 1464 was never true

1465 print("ROWS:") 

1466 for row in rows: 

1467 print(" ", row) 

1468 

1469 # Check for forced rowspan kludge. See e.g. 

1470 # maorski/Serbo-Croatian. These are essentially multi-row 

1471 # cells implemented using <br> rather than separate cell. We fix this 

1472 # by identifying rows where this happens, and splitting the current row 

1473 # to multiple rows by synthesizing additional cells. 

1474 new_rows = [] 

1475 for row in rows: 

1476 split_row = ( 

1477 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row) 

1478 and 

1479 # x is an InflCell 

1480 all(x.rowspan == 1 for x in row) 

1481 ) 

1482 if not split_row: 

1483 new_rows.append(row) 

1484 continue 

1485 row1 = [] 

1486 row2 = [] 

1487 for cell in row: 

1488 cell1 = copy.deepcopy(cell) 

1489 if "\n" in cell.text: 

1490 # Has more than one line - split this cell 

1491 parts = cell.text.strip().splitlines() 

1492 if len(parts) != 2: 1492 ↛ 1493line 1492 didn't jump to line 1493 because the condition on line 1492 was never true

1493 wxr.wtp.debug( 

1494 "forced rowspan kludge got {} parts: {!r}".format( 

1495 len(parts), cell.text 

1496 ), 

1497 sortid="inflection/1234", 

1498 ) 

1499 cell2 = copy.deepcopy(cell) 

1500 cell1.text = parts[0] 

1501 cell2.text = parts[1] 

1502 else: 

1503 cell1.rowspan = 2 

1504 cell2 = cell1 # ref, not a copy 

1505 row1.append(cell1) 

1506 row2.append(cell2) 

1507 new_rows.append(row1) 

1508 new_rows.append(row2) 

1509 rows = new_rows 

1510 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:") 

1511 # for row in rows: 

1512 # print(" ", row) 

1513 

1514 # Parse definitions for references (from table itself and from text 

1515 # after it) 

1516 def_ht = {} 

1517 

def add_defs(defs: list[tuple[str, str]]) -> None:
    """Record the tags of reference definitions in ``def_ht``.

    For each ``(ref, text)`` pair, only the text before the first ". " is
    used, with one trailing period removed.  The text is decoded with
    ``decode_tags``; if that yields topics or an "error-unknown-tag", the
    decoding is retried with the first character lowercased.  Definitions
    that still fail to decode are ignored.  On success the tags of all
    returned tagsets are flattened into one sorted tuple stored under
    ``def_ht[ref]``.
    """
    for ref, d in defs:
        # print("DEF: ref={} d={}".format(ref, d))
        d = d.strip().split(". ")[0].strip()  # text before ". "
        if not d:
            continue
        if d.endswith("."):
            # Drops a single trailing period; ".." is not handled specially.
            d = d[:-1]
        tags, topics = decode_tags(d, no_unknown_starts=True)
        if topics or any("error-unknown-tag" in ts for ts in tags):
            # Retry with a lowercased first character.  ``d`` can be empty
            # here (the definition was just "."), so slice instead of
            # indexing to avoid an IndexError.
            d = d[:1].lower() + d[1:]
            tags, topics = decode_tags(d, no_unknown_starts=True)
            if topics or any("error-unknown-tag" in ts for ts in tags):
                # Failed to parse as tags
                # print("Failed: topics={} tags={}"
                #       .format(topics, tags))
                continue
        tags1_s: set[str] = set()
        for ts in tags:
            # Set.update is a union operation: definition tags are flat
            tags1_s.update(ts)
        tags1 = tuple(sorted(tags1_s))
        # print("DEFINED: {} -> {}".format(ref, tags1))
        def_ht[ref] = tags1

1544 

def generate_tags(
    rowtags: list[tuple[str, ...]], table_tags: list[str]
) -> tuple[
    list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
]:
    """Expand the current header text against every row/column tagset pair.

    For each tagset in ``rowtags`` and each column tagset returned by
    ``compute_coltags`` for the current cell, ``expand_header`` is called
    with the union of row, column, global and table tags as base tags.

    Returns a triple ``(new_rowtags, new_coltags, all_hdr_tags)``:
    the header tagsets exactly as returned by ``expand_header`` go to
    ``all_hdr_tags``; the same tagsets extended with ``refs_tags`` go to
    ``new_coltags``; and those further combined with ``hdr_tags`` and
    (usually) the row tags go to ``new_rowtags``.  All three lists keep
    first-seen order without duplicates.
    """
    row_result: list[tuple[str, ...]] = []
    col_result: list[tuple[str, ...]] = []
    hdr_result: list[tuple[str, ...]] = []  # list of tuples
    for row_ts in rowtags:
        col_alternatives = compute_coltags(
            lang,
            pos,
            hdrspans,
            col_idx,  # col_idx=>start
            colspan,
            col,  # cell_text
        )
        for col_ts in col_alternatives:
            # base_tags are used in infl_map "if"-conds.
            base_tags: set[str] = {
                *row_ts,
                *col_ts,
                *global_tags,
                *table_tags,
            }
            expanded = expand_header(
                wxr,
                tablecontext,
                word,
                lang,
                pos,
                text,
                base_tags,
                depth=depth,
                column_number=col_idx,
            )
            for hdr_ts in expanded:
                if hdr_ts not in hdr_result:
                    hdr_result.append(hdr_ts)
                # Add tags from referenced footnotes, then normalize to a
                # sorted tuple so duplicates can be detected.
                with_refs = tuple(sorted({*hdr_ts, *refs_tags}))
                if with_refs not in col_result:
                    col_result.append(with_refs)
                # Kludge (saprast/Latvian/Verb): ignore row tags
                # if trying to add a non-finite after mood.
                drop_row_tags = any(
                    valid_tags[t] == "mood" for t in row_ts
                ) and any(valid_tags[t] == "non-finite" for t in with_refs)
                pieces = set(with_refs) | set(hdr_tags)
                if not drop_row_tags:
                    pieces |= set(row_ts)
                row_entry = tuple(sorted(pieces))
                if row_entry not in row_result:
                    row_result.append(row_entry)
    return row_result, col_result, hdr_result

1601 

def add_new_hdrspan(
    col: str,
    hdrspans: list[HdrSpan],
    store_new_hdrspan: bool,
    col0_followed_by_nonempty: bool,
    col0_hdrspan: Optional[HdrSpan],
) -> tuple[str, bool, Optional[HdrSpan]]:
    """Create a header span for the current cell and update col0 tracking.

    A new ``HdrSpan`` is built from the enclosing scope's current cell
    position and tagsets and appended to ``hdrspans`` (and to
    ``tablecontext.stored_hdrspans`` when ``store_new_hdrspan`` is true).
    The function then decides whether the tracked left-side header
    (``col0_hdrspan``) is kept, widened up to the current column, or
    replaced by the new span.

    Returns ``(col, col0_followed_by_nonempty, col0_hdrspan)`` with the
    possibly updated tracking values; ``col`` is returned unchanged.
    """
    hdrspan = HdrSpan(
        col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
    )
    hdrspans.append(hdrspan)

    # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
    # to be added to a register of stored hdrspans to be used
    # later with "dummy-load-stored-hdrspans".
    if store_new_hdrspan:
        tablecontext.stored_hdrspans.append(hdrspan)

    # Handle headers that are above left-side header
    # columns and are followed by personal pronouns in
    # remaining columns (basically headers that
    # evaluate to no tags).  In such cases widen the
    # left-side header to the full row.
    if previously_seen:  # id(cell) in seen_cells previously
        return col, True, col0_hdrspan
    if col0_hdrspan is None:
        return col, col0_followed_by_nonempty, hdrspan
    if not any(all_hdr_tags):
        return col, col0_followed_by_nonempty, col0_hdrspan

    col0_cats = tagset_cats(col0_hdrspan.tagsets)
    later_cats = tagset_cats(all_hdr_tags)
    col0_allowed = get_lang_conf(lang, "hdr_expand_first")
    later_allowed = get_lang_conf(lang, "hdr_expand_cont")
    later_allowed = later_allowed | set(["dummy"])
    # dummy2 has different behavior than plain dummy
    # and does not belong here.

    # Only expand if [col0_cats and later_cats are allowed
    # and don't overlap] and [col0 has tags], and there have
    # been [no disallowed cells in between].
    #
    # There are three cases here:
    #   - col0_hdrspan set, continue with allowed current
    #   - col0_hdrspan set, expand, start new
    #   - col0_hdrspan set, no expand, start new
    if (
        not col0_followed_by_nonempty
        and
        # XXX Only one cat of tags: kunna/Swedish
        # XXX len(col0_cats) == 1 and
        col0_hdrspan.rowspan >= rowspan
        and
        # from hdrspan
        not (later_cats - later_allowed)
        and not (col0_cats & later_cats)
    ):
        # First case: col0 set, continue
        return col, col0_followed_by_nonempty, col0_hdrspan
    # We are going to start new col0_hdrspan.  Check if
    # we should expand.
    if (
        not col0_followed_by_nonempty
        and not (col0_cats - col0_allowed)
        and
        # Only "allowed" allowed
        # XXX len(col0_cats) == 1 and
        col_idx > col0_hdrspan.start + col0_hdrspan.colspan
    ):
        # col_idx is beyond current colspan
        # *Expand* current col0_hdrspan
        col0_hdrspan.colspan = col_idx - col0_hdrspan.start
        col0_hdrspan.expanded = True
    if col == debug_cell_text:
        print("START NEW {}".format(hdrspan.tagsets))
    # Start a new col0 header.  ``previously_seen`` is known to be false
    # here (that case returned above), so the new span always replaces the
    # old one and no intermediate reset to None is needed.
    return col, False, hdrspan

1699 

def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
    """Split the text of one table cell into alternative forms.

    Reads ``special_phrase_splits`` and ``lang`` from the enclosing scope.

    Returns a tuple ``(col, alts, split_extra_tags)``:

    * ``col`` - the cell text.  When the default splitting path was taken,
      matches of ``form_transformations`` patterns in it have been replaced
      by placeholder characters.
    * ``alts`` - the alternatives found in the cell, after minor cleanup.
    * ``split_extra_tags`` - tags attached to a language-specific special
      split; empty unless ``col`` was found in ``special_phrase_splits``.
    """
    split_extra_tags: list[str] = []
    if col and is_superscript(col[0]):
        # Cells starting with a superscript character are kept whole.
        alts = [col]
    else:
        separators = [";", "•", r"\n", " or "]
        if " + " not in col:
            separators.append(",")
            if not col.endswith("/"):
                separators.append("/")
        if col in special_phrase_splits:
            # Use language-specific special splits.
            # These are phrases and constructions that have
            # unique ways of splitting, not specific characters
            # to split on like with the default splitting.
            alts, tags = special_phrase_splits[col]
            split_extra_tags = tags.split()
            for x in split_extra_tags:
                assert x in valid_tags
            assert isinstance(alts, (list, tuple))
            assert isinstance(tags, str)
        elif (
            (
                m := re.match(
                    # word1, word2 (romanization1, romanization2)
                    r"\s*([^(),]+),([^(),]+)\(([^(),]+),([^(),]+)\)",
                    col,
                )
            )
            # NOT `word, (tag, tag)` with an empty m.group(2)...
            # There is a test that fails because of this. It's an
            # outdated table, but still, ...Italian_verb1
            and all(s.strip() for s in m.groups())
            and any(
                # except for entries like word1, word2 (tag2, tag2)...
                classify_desc(s) in ("english", "romanization")
                for s in (m.group(3), m.group(4))
            )
        ):
            alts = [m.group(1), m.group(2), m.group(3), m.group(4)]
        else:
            # Use default splitting.  However, recognize
            # language-specific replacements and change them to magic
            # characters before splitting.  This way we won't split
            # them.  This is important for, e.g., recognizing
            # alternative pronouns.
            # Magic characters are code points counted upwards from
            # MAGIC_FIRST, one per pattern that matched.
            repls: dict[str, str] = {}
            magic_ch = MAGIC_FIRST
            trs = get_lang_conf(lang, "form_transformations")
            # Each entry of trs has four fields; only the second one,
            # a pattern string like "^ich", is used here.
            # form_transformations data is doing double-duty here,
            # because the pattern strings are already known to us and
            # not meant to be split.
            for _, pattern, _, _ in trs:
                found = re.search(pattern, col)
                if found is not None:
                    magic = chr(magic_ch)
                    magic_ch += 1  # next magic character value
                    col = re.sub(pattern, magic, col)
                    # Remember what text each magic character stands
                    # for.  .group(0) is the whole (first) match.
                    repls[magic] = found.group(0)
            # With magic characters in place, split the text so that
            # pre-transformation text is out of the way.
            alts0 = split_at_comma_semi(col, separators=separators)
            alts = []
            for alt in alts0:
                # Put the original texts back.  This must be a literal
                # replacement: the saved text comes straight from the
                # cell, and using it as an re.sub() template would
                # interpret any backslash in it as an escape sequence
                # or group reference.
                for magic, original in repls.items():
                    alt = alt.replace(magic, original)
                alts.append(alt)

    # Remove "*" from beginning of forms, as in non-attested
    # or reconstructed forms.  Otherwise it might confuse romanization
    # detection.
    alts = [re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts]
    # Drop alternatives that are pronunciation or usage remarks.
    alts = [x for x in alts if not re.match(r"pronounced with |\(with ", x)]
    # Strip a leading "(in the sense ...)" qualifier.
    alts = [re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts]
    return col, alts, split_extra_tags

1791 

def handle_parens(
    form: str, roman: str, clitic: str | None, extra_tags: list[str]
) -> tuple[str, str, str | None]:
    """Interpret the parenthesized part that was found in ``form``.

    Reads ``paren`` (text inside the parentheses), ``m`` (the regex match
    covering the parenthesized part) and ``subst`` (the text put in its
    place) from the enclosing scope.

    Depending on what the parenthesized text looks like, it becomes the
    clitic, contributes tags (appended to ``extra_tags`` in place), becomes
    the romanization, or is simply dropped.  In every such case the
    parenthesized part is cut out of ``form``; otherwise the arguments are
    returned unchanged.

    Returns the tuple ``(form, roman, clitic)``.
    """
    if TYPE_CHECKING:
        assert isinstance(paren, str)
        assert isinstance(m, re.Match)

    # The form with the parenthesized part replaced by `subst`.
    stripped_form = (form[: m.start()] + subst + form[m.end() :]).strip()

    # An apostrophe followed by one to three lowercase letters: treat the
    # whole parenthesized text as a clitic.
    if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
        return stripped_form, roman, paren

    if classify_desc(paren) == "tags":
        tagsets1, topics1 = decode_tags(paren)
        if topics1:
            # Topics were decoded; leave everything as it was.
            return form, roman, clitic
        for ts in tagsets1:
            # There are some generated tags containing
            # spaces; do not let them through here.
            extra_tags.extend(x for x in ts if " " not in x)
        return stripped_form, roman, clitic

    # Parentheses contain a romanization: they are not at the start, no
    # romanization is known yet and the text before them classifies as
    # "other".
    if (
        m.start() > 0
        and not roman
        and classify_desc(form[: m.start()]) == "other"
        and classify_desc(paren) in ("romanization", "english")
        and not re.search(r"^with |-form$", paren)
    ):
        return stripped_form, paren, clitic

    # "with ..." / "...-form..." remarks are removed without being used.
    if re.search(r"^with |-form", paren):
        return stripped_form, roman, clitic

    return form, roman, clitic

1828 

def merge_row_and_column_tags(
    form: str,
    some_has_covered_text: bool,
    links: list[tuple[str, str]] | None = None,
) -> tuple[list[FormData], str, bool]:
    """Build form entries for ``form`` from each row/column tagset pair.

    Besides its arguments, this reads ``rowtags``, ``coltags``,
    ``global_tags``, ``extra_tags``, ``refs_tags``, ``tablecontext``,
    ``lang``, ``pos``, ``wxr``, ``col_idx``, ``has_covering_hdr``,
    ``source``, ``roman``, ``ipa`` and ``clitic`` from the enclosing scope.

    Args:
        form: Text of the form; it may be adjusted while tags are
            extracted from it.
        some_has_covered_text: Flag carried between calls; set once a
            non-ignored form is seen in a column listed in
            ``has_covering_hdr``.
        links: Links of the cell, matched against the form; ``None``
            disables link matching.

    Returns:
        ``(entries, form, some_has_covered_text)`` with the generated
        entries, the possibly adjusted form and the updated flag.
    """
    # Merge column tags and row tags.  We give preference
    # to moods etc coming from rowtags (cf. austteigen/German/Verb
    # imperative forms).

    # Bookkeeping tags that must never end up in the output.
    dummy_tags = frozenset(
        (
            "dummy-mood",
            "dummy-tense",
            "dummy-ignore-skipped",
            "dummy-object-concord",
            "dummy-reset-headers",
            "dummy-use-as-coltags",
            "dummy-use-as-rowtags",
            "dummy-store-hdrspan",
            "dummy-load-stored-hdrspans",
            "dummy-reset-stored-hdrspans",
            "dummy-section-header",
        )
    )

    ret: list[FormData] = []
    for rt in sorted(rowtags):
        if "dummy-use-as-coltags" in rt:
            continue
        for ct in sorted(coltags):
            if "dummy-use-as-rowtags" in ct:
                continue
            tags = set(global_tags)
            tags.update(extra_tags)
            tags.update(rt)
            tags.update(refs_tags)
            tags.update(tablecontext.section_header)
            # Merge tags from column.  For certain kinds of tags,
            # those coming from row take precedence.
            old_tags = set(tags)
            for t in ct:
                c = valid_tags[t]
                if c in ("mood", "case", "number") and any(
                    valid_tags[tt] == c for tt in old_tags
                ):
                    continue
                tags.add(t)

            # Extract language-specific tags from the
            # form.  This may also adjust the form.
            form, lang_tags = lang_specific_tags(lang, pos, form)
            tags.update(lang_tags)

            # For non-finite verb forms, see if they have
            # a gender/class suffix
            if pos == "verb" and any(
                valid_tags[t] == "non-finite" for t in tags
            ):
                form, tt = parse_head_final_tags(wxr, lang, form)
                tags.update(tt)

            # Remove "personal" tag if have nth person; these
            # come up with e.g. reconhecer/Portuguese/Verb.  But
            # not if we also have "pronoun"
            if (
                "personal" in tags
                and "pronoun" not in tags
                and any(
                    x in tags
                    for x in (
                        "first-person",
                        "second-person",
                        "third-person",
                    )
                )
            ):
                tags.remove("personal")

            # If we have impersonal, remove person and number.
            # This happens with e.g. viajar/Portuguese/Verb
            if "impersonal" in tags:
                tags -= {
                    "first-person",
                    "second-person",
                    "third-person",
                    "singular",
                    "plural",
                }

            # Remove unnecessary "positive" tag from verb forms
            if pos == "verb" and "positive" in tags:
                if "negative" in tags:
                    tags.remove("negative")
                tags.remove("positive")

            # Many Russian (and other Slavic) inflection tables
            # have animate/inanimate distinction that generates
            # separate entries for neuter/feminine, but the
            # distinction only applies to masculine.  Remove them
            # from neuter/feminine and eliminate duplicates.
            if get_lang_conf(lang, "masc_only_animate"):
                for t1 in ("animate", "inanimate"):
                    for t2 in ("neuter", "feminine"):
                        if (
                            t1 in tags
                            and t2 in tags
                            and "masculine" not in tags
                            and "plural" not in tags
                        ):
                            tags.remove(t1)

            # German adjective tables contain "(keiner)" etc
            # for mixed declension plural.  When the adjective
            # disappears and it becomes just one word, remove
            # the "includes-article" tag.  e.g. eiskalt/German
            if "includes-article" in tags and " " not in form:
                tags.remove("includes-article")

            # Handle ignored forms.  We mark that the form was
            # provided.  This is important information; some words
            # just do not have a certain form.  However, there are also
            # many cases where no word in a language has a
            # particular form.  Post-processing could detect and
            # remove such cases.
            if form in IGNORED_COLVALUES:
                # if cell text seems to be ignorable
                if "dummy-ignore-skipped" in tags:
                    continue
                if col_idx not in has_covering_hdr and some_has_covered_text:
                    continue
                # don't ignore this cell if there's been a header
                # above it
                form = "-"
            elif col_idx in has_covering_hdr:
                some_has_covered_text = True

            # Handle ambiguous object concord.  If a header
            # gives the "dummy-object-concord"-tag to a word,
            # replace person, number and gender tags with
            # their "object-" counterparts so that the verb
            # agrees with the object instead.
            # Use only when the verb has ONLY object agreement!
            # a پخول/Pashto
            if "dummy-object-concord" in tags:
                for subtag, objtag in object_concord_replacements.items():
                    if subtag in tags:
                        tags.remove(subtag)
                        tags.add(objtag)

            # Remove the dummy mood tag that we sometimes
            # use to block adding other mood and related
            # tags, together with the other bookkeeping tags.
            tags -= dummy_tags

            # Perform language-specific tag replacements according
            # to rules in a table.
            lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
            if lang_tag_mappings is not None:
                for pre, post in lang_tag_mappings.items():
                    if all(t in tags for t in pre):
                        tags = (tags - set(pre)) | set(post)

            # Warn if there are entries with empty tags
            if not tags:
                wxr.wtp.debug(
                    "inflection table: empty tags for {}".format(form),
                    sortid="inflection/1826",
                )

            # Warn if form looks like IPA, i.e. is wrapped in slashes.
            # XXX A check against IPA characters would be more precise
            # than looking only at the surrounding slashes.
            if re.match(r"\s*/.*/\s*$", form):
                wxr.wtp.debug(
                    "inflection table form looks like IPA: "
                    "form={} tags={}".format(form, tags),
                    sortid="inflection/1840",
                )

            # Note that this checks `form`, not `in tags`
            if form == "dummy-ignored-text-cell":
                continue

            if "dummy-remove-this-cell" in tags:
                continue

            # Add the form
            tags_list = sorted(tags)
            dt: FormData = {
                "form": form,
                "tags": tags_list,
                "source": source,
            }
            if roman:
                dt["roman"] = roman
            if ipa:
                dt["ipa"] = ipa
            # Use the links handed in by the caller rather than reaching
            # for a variable of the enclosing scope.
            if links is not None and (
                matched_links := match_links_to_form(wxr, form, links, None)
            ):
                dt["links"] = matched_links
            ret.append(dt)
            # If we got separate clitic form, add it
            if clitic:
                clitic_dt: FormData = {
                    "form": clitic,
                    "tags": tags_list + ["clitic"],
                    "source": source,
                }
                ret.append(clitic_dt)
    return ret, form, some_has_covered_text

2066 

2067 # First extract definitions from cells 

2068 # See defs_ht for footnote defs stuff 

2069 for row in rows: 

2070 for cell in row: 

2071 text, refs, defs, hdr_tags = extract_cell_content( 

2072 lang, word, cell.text 

2073 ) 

2074 # refs, defs = footnote stuff, defs -> (ref, def) 

2075 add_defs(defs) 

2076 # Extract definitions from text after table 

2077 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after) 

2078 add_defs(defs) 

2079 

2080 # Then extract the actual forms 

2081 ret = [] 

2082 hdrspans: list[HdrSpan] = [] 

2083 first_col_has_text = False 

2084 rownum = 0 

2085 title = None 

2086 global_tags = [] 

2087 table_tags = [] 

2088 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits") 

2089 form_replacements = get_lang_conf(lang, "form_replacements") 

2090 form_transformations = get_lang_conf(lang, "form_transformations") 

2091 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells") 

2092 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups") 

2093 

2094 for title in titles: 

2095 more_global_tags, more_table_tags, extra_forms = parse_title( 

2096 title, source 

2097 ) 

2098 global_tags.extend(more_global_tags) 

2099 table_tags.extend(more_table_tags) 

2100 ret.extend(extra_forms) 

2101 cell_rowcnt: collections.defaultdict[int, int] = collections.defaultdict( 

2102 int 

2103 ) 

2104 seen_cells = set() 

2105 has_covering_hdr = set() 

2106 some_has_covered_text = False 

2107 for row in rows: 

2108 # print("ROW:", row) 

2109 # print("====") 

2110 # print(f"Start of PREVIOUS row hdrspans:" 

2111 # f"{tuple(sp.tagsets for sp in hdrspans)}") 

2112 # print(f"Start of row txt: {tuple(t.text for t in row)}") 

2113 if not row: 2113 ↛ 2114line 2113 didn't jump to line 2114 because the condition on line 2113 was never true

2114 continue # Skip empty rows 

2115 all_headers = all(x.is_title or not x.text.strip() for x in row) 

2116 text = row[0].text 

2117 if ( 

2118 row[0].is_title 

2119 and text 

2120 and not is_superscript(text[0]) 

2121 and text not in infl_map # zealous inflation map? 

2122 and ( 

2123 re.match(r"Inflection ", text) 

2124 or re.sub( 

2125 r"\s+", 

2126 " ", # flatten whitespace 

2127 re.sub( 

2128 r"\s*\([^)]*\)", 

2129 "", 

2130 # Remove whitespace+parens 

2131 text, 

2132 ), 

2133 ).strip() 

2134 not in infl_map 

2135 ) 

2136 and not re.match(infl_start_re, text) 

2137 and all( 

2138 x.is_title == row[0].is_title and x.text == text 

2139 # all InflCells in `row` have the same is_title and text 

2140 for x in row 

2141 ) 

2142 ): 

2143 if text and title is None: 

2144 # Only if there were no titles previously make the first 

2145 # text that is found the title 

2146 title = text 

2147 if re.match(r"(Note:|Notes:)", title): 2147 ↛ 2148line 2147 didn't jump to line 2148 because the condition on line 2147 was never true

2148 continue # not a title 

2149 more_global_tags, more_table_tags, extra_forms = parse_title( 

2150 title, source 

2151 ) 

2152 global_tags.extend(more_global_tags) 

2153 table_tags.extend(more_table_tags) 

2154 ret.extend(extra_forms) 

2155 continue # Skip title rows without incrementing i 

2156 if "dummy-skip-this" in global_tags: 2156 ↛ 2157line 2156 didn't jump to line 2157 because the condition on line 2156 was never true

2157 return [] 

2158 rowtags: list[tuple[str, ...]] = [()] 

2159 # have_hdr = False 

2160 # have_hdr never used? 

2161 have_text = False 

2162 samecell_cnt = 0 

2163 col0_hdrspan = None # col0 or later header (despite its name) 

2164 col0_followed_by_nonempty = False 

2165 row_empty = True 

2166 for col_idx, cell in enumerate(row): 

2167 colspan = cell.colspan # >= 1 

2168 rowspan = cell.rowspan # >= 1 

2169 cell_links = cell.links # for weird links 

2170 previously_seen = id(cell) in seen_cells 

2171 # checks to see if this cell was in the previous ROW 

2172 seen_cells.add(id(cell)) 

2173 if samecell_cnt == 0: 

2174 # First column of a (possible multi-column) cell 

2175 samecell_cnt = colspan - 1 

2176 else: 

2177 assert samecell_cnt > 0 

2178 samecell_cnt -= 1 

2179 continue 

2180 

2181 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0 

2182 # never used? 

2183 

2184 # defaultdict(int) around line 1900 

2185 cell_rowcnt[id(cell)] += 1 

2186 # => how many cols this spans 

2187 col: str = cell.text 

2188 if not col: 

2189 continue 

2190 row_empty = False 

2191 is_title = cell.is_title 

2192 

2193 # If the cell has a target, i.e., text after colon, interpret 

2194 # it as simply specifying a value for that value and ignore 

2195 # it otherwise. 

2196 if cell.target: 

2197 text, refs, defs, hdr_tags = extract_cell_content( 

2198 lang, word, col 

2199 ) 

2200 if not text: 2200 ↛ 2201line 2200 didn't jump to line 2201 because the condition on line 2200 was never true

2201 continue 

2202 refs_tags: set[str] = set() 

2203 for ref in refs: # gets tags from footnotes 2203 ↛ 2204line 2203 didn't jump to line 2204 because the loop on line 2203 never started

2204 if ref in def_ht: 

2205 refs_tags.update(def_ht[ref]) 

2206 rowtags = expand_header( 

2207 wxr, 

2208 tablecontext, 

2209 word, 

2210 lang, 

2211 pos, 

2212 text, 

2213 [], 

2214 silent=True, 

2215 depth=depth, 

2216 column_number=col_idx, 

2217 ) 

2218 rowtags = list( 

2219 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags) 

2220 ) 

2221 is_title = False 

2222 col = cell.target 

2223 

2224 # print(rownum, col_idx, col) 

2225 # print(f"is_title: {is_title}") 

2226 if is_title: 

2227 # It is a header cell 

2228 text, refs, defs, hdr_tags = extract_cell_content( 

2229 lang, word, col 

2230 ) 

2231 if not text: 

2232 continue 

2233 # Extract tags from referenced footnotes 

2234 refs_tags = set() 

2235 for ref in refs: 

2236 if ref in def_ht: 

2237 refs_tags.update(def_ht[ref]) 

2238 

2239 # Expand header to tags 

2240 v = expand_header( 

2241 wxr, 

2242 tablecontext, 

2243 word, 

2244 lang, 

2245 pos, 

2246 text, 

2247 [], 

2248 silent=True, 

2249 depth=depth, 

2250 column_number=col_idx, 

2251 ) 

2252 # print("EXPANDED {!r} to {}".format(text, v)) 

2253 

2254 if col_idx == 0: 

2255 # first_col_has_text is used for a test to ignore 

2256 # upper-left cells that are just text without 

2257 # header info 

2258 first_col_has_text = True 

2259 # Check if the header expands to reset hdrspans 

2260 if any("dummy-reset-headers" in tt for tt in v): 

2261 new_hdrspans = [] 

2262 for hdrspan in hdrspans: 

2263 # if there are HdrSpan objects (abstract headers with 

2264 # row- and column-spans) that are to the left or at the 

2265 # same row or below, KEEP those; things above and to 

2266 # the right of the hdrspan with dummy-reset-headers 

2267 # are discarded. Tags from the header together with 

2268 # dummy-reset-headers are kept as normal. 

2269 if ( 

2270 hdrspan.start + hdrspan.colspan < col_idx 

2271 or hdrspan.rownum > rownum - cell.rowspan 

2272 ): 

2273 new_hdrspans.append(hdrspan) 

2274 hdrspans = new_hdrspans 

2275 

2276 for tt in v: 

2277 if "dummy-section-header" in tt: 2277 ↛ 2278line 2277 didn't jump to line 2278 because the condition on line 2277 was never true

2278 tablecontext.section_header = tt 

2279 break 

2280 if "dummy-reset-section-header" in tt: 2280 ↛ 2281line 2280 didn't jump to line 2281 because the condition on line 2280 was never true

2281 tablecontext.section_header = tuple() 

2282 # Text between headers on a row causes earlier headers to 

2283 # be reset 

2284 if have_text: 

2285 # print(" HAVE_TEXT BEFORE HDR:", col) 

2286 # Reset rowtags if new title column after previous 

2287 # text cells 

2288 # +-----+-----+-----+-----+ 

2289 # |hdr-a|txt-a|hdr-B|txt-B| 

2290 # +-----+-----+-----+-----+ 

2291 # ^reset rowtags=> 

2292 # XXX beware of header "—": "" - must not clear on that if 

2293 # it expands to no tags 

2294 rowtags = [()] 

2295 # have_hdr = True 

2296 # have_hdr never used? 

2297 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags)) 

2298 # Update rowtags and coltags 

2299 has_covering_hdr.add(col_idx) # col_idx == current column 

2300 # has_covering_hdr is a set that has the col_idx-ids of columns 

2301 # that have previously had some kind of header. It is never 

2302 # resetted inside the col_idx-loops OR the bigger rows-loop, so 

2303 # applies to the whole table. 

2304 

2305 new_coltags: list[tuple[str, ...]] 

2306 all_hdr_tags: list[tuple[str, ...]] 

2307 rowtags, new_coltags, all_hdr_tags = generate_tags( 

2308 rowtags, table_tags 

2309 ) 

2310 

2311 if any("dummy-skip-this" in ts for ts in rowtags): 

2312 continue # Skip this cell 

2313 

2314 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2314 ↛ 2315line 2314 didn't jump to line 2315 because the condition on line 2314 was never true

2315 hdrspans.extend(tablecontext.stored_hdrspans) 

2316 

2317 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2317 ↛ 2318line 2317 didn't jump to line 2318 because the condition on line 2317 was never true

2318 tablecontext.stored_hdrspans = [] 

2319 

2320 if any("dummy-store-hdrspan" in ts for ts in v): 2320 ↛ 2322line 2320 didn't jump to line 2322 because the condition on line 2320 was never true

2321 # print(f"STORED: {col}") 

2322 store_new_hdrspan = True 

2323 else: 

2324 store_new_hdrspan = False 

2325 

2326 new_coltags = list( 

2327 x 

2328 for x in new_coltags 

2329 if not any(t in noinherit_tags for t in x) 

2330 ) 

2331 # print("new_coltags={} previously_seen={} all_hdr_tags={}" 

2332 # .format(new_coltags, previously_seen, all_hdr_tags)) 

2333 if any(new_coltags): 

2334 ( 

2335 col, 

2336 col0_followed_by_nonempty, 

2337 col0_hdrspan, 

2338 ) = add_new_hdrspan( 

2339 col, 

2340 hdrspans, 

2341 store_new_hdrspan, 

2342 col0_followed_by_nonempty, 

2343 col0_hdrspan, 

2344 ) 

2345 

2346 continue 

2347 

2348 # These values are ignored, at least for now 

2349 if re.match(r"^(# |\(see )", col): 2349 ↛ 2350line 2349 didn't jump to line 2350 because the condition on line 2349 was never true

2350 continue 

2351 

2352 if any("dummy-skip-this" in ts for ts in rowtags): 

2353 continue # Skip this cell 

2354 

2355 # If the word has no rowtags and is a multi-row cell, then 

2356 # ignore this. This happens with empty separator rows 

2357 # within a rowspan>1 cell. cf. wander/English/Conjugation. 

2358 if rowtags == [()] and rowspan > 1: 

2359 continue 

2360 

2361 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle. 

2362 if cleanup_rules: 

2363 for regx, substitution in cleanup_rules.items(): 

2364 col = re.sub(regx, substitution, col) 

2365 

2366 if ( 2366 ↛ 2371line 2366 didn't jump to line 2371 because the condition on line 2366 was never true

2367 col_idx == 0 

2368 and not first_col_has_text 

2369 and get_lang_conf(lang, "ignore_top_left_text_cell") is True 

2370 ): 

2371 continue # Skip text at top left, as in Icelandic, Faroese 

2372 

2373 # if col0_hdrspan is not None: 

2374 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}" 

2375 # .format(col0_hdrspan.text, col)) 

2376 col0_followed_by_nonempty = True 

2377 have_text = True 

2378 

2379 # Determine column tags for the multi-column cell 

2380 combined_coltags = compute_coltags( 

2381 lang, pos, hdrspans, col_idx, colspan, col 

2382 ) 

2383 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2383 ↛ 2384line 2383 didn't jump to line 2384 because the condition on line 2383 was never true

2384 continue 

2385 

2386 # Split the text into separate forms. First simplify spaces except 

2387 # newline. 

2388 col = re.sub(r"[ \t\r]+", " ", col) 

2389 # Split the cell text into alternatives 

2390 

2391 col, alts, split_extra_tags = split_text_into_alts(col) 

2392 

2393 # Some cells have mixed form content, like text and romanization, 

2394 # or text and IPA. Handle these. 

2395 altss = handle_mixed_lines(alts, tablecontext) 

2396 

2397 altsss = list((x, combined_coltags, cell_links) for x in altss) 

2398 

2399 # Generate forms from the alternatives 

2400 # alts is a list of (tuple of forms, tuple of tags) 

2401 coltags: list[tuple[str, ...]] 

2402 base_roman: str 

2403 ipa: str 

2404 for (form, base_roman, ipa), coltags, cell_links in altsss: 

2405 form = form.strip() 

2406 extra_tags: list[str] = [] 

2407 extra_tags.extend(split_extra_tags) 

2408 # Handle special splits again here, so that we can have custom 

2409 # mappings from form to form and tags. 

2410 if form in form_replacements: 

2411 replacement, tags = form_replacements[form] 

2412 for x in tags.split(): 

2413 assert x in valid_tags 

2414 assert isinstance(replacement, str) 

2415 assert isinstance(tags, str) 

2416 form = replacement 

2417 extra_tags.extend(tags.split()) 

2418 

2419 check_romanization_form_transformation = False 

2420 # loop over regexes in form_transformation and replace text 

2421 # in form using regex patterns 

2422 # this does a bit of the same stuff the above does, 

2423 # but with regexes and re.sub() instead 

2424 subst: str 

2425 for ( 

2426 form_transformations_pos, 

2427 vv, 

2428 subst, 

2429 tags, 

2430 ) in form_transformations: 

2431 # v is a pattern string, like "^ich" 

2432 if ( 

2433 isinstance(form_transformations_pos, str) 

2434 and pos != form_transformations_pos 

2435 ) or ( 

2436 (not isinstance(form_transformations_pos, str)) 

2437 and pos not in form_transformations_pos 

2438 ): 

2439 continue 

2440 m: re.Match | None = re.search(vv, form) 

2441 if m is not None: 

2442 if base_roman: 2442 ↛ 2443line 2442 didn't jump to line 2443 because the condition on line 2442 was never true

2443 for _, rom_v, rom_sub, _ in form_transformations: 

2444 rom_m = re.search(rom_v, base_roman) 

2445 if rom_m is not None: 

2446 base_roman = re.sub( 

2447 rom_v, rom_sub, base_roman 

2448 ) 

2449 break 

2450 form = re.sub(vv, subst, form) 

2451 for x in tags.split(): 

2452 assert x in valid_tags 

2453 extra_tags.extend(tags.split()) 

2454 check_romanization_form_transformation = True 

2455 break 

2456 

2457 # Clean the value, extracting reference symbols 

2458 form, refs, defs, hdr_tags = extract_cell_content( 

2459 lang, word, form 

2460 ) 

2461 # if refs: 

2462 # print("REFS:", refs) 

2463 extra_tags.extend(hdr_tags) 

2464 # Extract tags from referenced footnotes 

2465 refs_tags = set() 

2466 for ref in refs: 

2467 if ref in def_ht: 

2468 refs_tags.update(def_ht[ref]) 

2469 

2470 if base_roman: 

2471 if check_romanization_form_transformation: 2471 ↛ 2475line 2471 didn't jump to line 2475 because the condition on line 2471 was never true

2472 # because form_transformations are used to handle things 

2473 # where the romanization has the "same" structure, we 

2474 # need to handle that here too.... 

2475 for ( 

2476 _, 

2477 vv, 

2478 subst, 

2479 _, 

2480 ) in form_transformations: 

2481 # v is a pattern string, like "^ich" 

2482 m = re.search(vv, base_roman) 

2483 if m is not None: 

2484 base_roman = re.sub(vv, subst, base_roman) 

2485 # XXX add tag stuff here if needed 

2486 break 

2487 

2488 base_roman, _, _, hdr_tags = extract_cell_content( 

2489 lang, word, base_roman 

2490 ) 

2491 extra_tags.extend(hdr_tags) 

2492 

2493 # Do some additional cleanup on the cell. 

2494 form = re.sub(r"^\s*,\s*", "", form) 

2495 form = re.sub(r"\s*,\s*$", "", form) 

2496 form = re.sub(r"\s*(,\s*)+", ", ", form) 

2497 form = re.sub(r"(?i)^Main:", "", form) 

2498 form = re.sub(r"\s+", " ", form) 

2499 form = form.strip() 

2500 

2501 # Look for parentheses that have semantic meaning 

2502 form, et = find_semantic_parens(form, lang) 

2503 extra_tags.extend(et) 

2504 

2505 # Handle parentheses in the table element. We parse 

2506 # tags anywhere and romanizations anywhere but beginning. 

2507 roman: str = base_roman 

2508 paren: str | None = None 

2509 clitic: str | None = None 

2510 m = re.search(r"(\s+|^)\(([^)]*)\)", form) 

2511 # start|spaces + (anything) 

2512 if m is not None: 

2513 subst = m.group(1) 

2514 paren = m.group(2) 

2515 else: 

2516 m = re.search(r"\(([^)]*)\)(\s+|$)", form) 

2517 # (anything) + spaces|end 

2518 if m is not None: 2518 ↛ 2519line 2518 didn't jump to line 2519 because the condition on line 2518 was never true

2519 paren = m.group(1) 

2520 subst = m.group(2) 

2521 if paren is not None: 

2522 form, roman, clitic = handle_parens( 

2523 form, roman, clitic, extra_tags 

2524 ) 

2525 

2526 # Ignore certain forms that are not really forms, 

2527 # unless they're really, really close to the article title 

2528 if form in ( 2528 ↛ 2533line 2528 didn't jump to line 2533 because the condition on line 2528 was never true

2529 "", 

2530 "unchanged", 

2531 "after an", # in sona/Irish/Adj/Mutation 

2532 ): 

2533 Lev = distw([form], word) 

2534 if form and Lev < 0.1: 

2535 wxr.wtp.debug( 

2536 "accepted possible false positive '{}' with" 

2537 "> 0.1 Levenshtein distance in {}/{}".format( 

2538 form, word, lang 

2539 ), 

2540 sortid="inflection/2213", 

2541 ) 

2542 elif form and Lev < 0.3: 

2543 wxr.wtp.debug( 

2544 "skipped possible match '{}' with > 0.3" 

2545 "Levenshtein distance in {}/{}".format( 

2546 form, word, lang 

2547 ), 

2548 sortid="inflection/2218", 

2549 ) 

2550 continue 

2551 else: 

2552 continue 

2553 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} " 

2554 # "FORM={!r} ROMAN={!r}" 

2555 # .format(rowtags, coltags, refs_tags, 

2556 # form, roman)) 

2557 

2558 # Merge tags from row and column and do miscellaneous 

2559 # tag-related handling. 

2560 ( 

2561 merge_ret, 

2562 form, 

2563 some_has_covered_text, 

2564 ) = merge_row_and_column_tags( 

2565 form, some_has_covered_text, cell_links 

2566 ) 

2567 ret.extend(merge_ret) 

2568 

2569 # End of row. 

2570 rownum += 1 

2571 # For certain languages, if the row was empty, reset 

2572 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb). 

2573 if row_empty and get_lang_conf(lang, "empty_row_resets"): 

2574 hdrspans = [] 

2575 # Check if we should expand col0_hdrspan. 

2576 if col0_hdrspan is not None: 

2577 col0_allowed = get_lang_conf(lang, "hdr_expand_first") 

2578 col0_cats = tagset_cats(col0_hdrspan.tagsets) 

2579 # Only expand if col0_cats and later_cats are allowed 

2580 # and don't overlap and col0 has tags, and there have 

2581 # been no disallowed cells in between. 

2582 if ( 

2583 not col0_followed_by_nonempty 

2584 and not (col0_cats - col0_allowed) 

2585 and 

2586 # len(col0_cats) == 1 and 

2587 col_idx > col0_hdrspan.start + col0_hdrspan.colspan 

2588 ): 

2589 # If an earlier header is only followed by headers that yield 

2590 # no tags, expand it to entire row 

2591 # print("EXPANDING COL0: {} from {} to {} cols {}" 

2592 # .format(col0_hdrspan.text, col0_hdrspan.colspan, 

2593 # len(row) - col0_hdrspan.start, 

2594 # col0_hdrspan.tagsets)) 

2595 col0_hdrspan.colspan = len(row) - col0_hdrspan.start 

2596 col0_hdrspan.expanded = True 

2597 # XXX handle refs and defs 

2598 # for x in hdrspans: 

2599 # print(" HDRSPAN {} {} {} {!r}" 

2600 # .format(x.start, x.colspan, x.tagsets, x.text)) 

2601 

2602 # Post-process German nouns with articles in separate columns. We move the 

2603 # definite/indefinite/usually-without-article markers into the noun and 

2604 # remove the article entries. 

2605 if get_lang_conf(lang, "articles_in_separate_columns") and any( 

2606 "noun" in x["tags"] for x in ret 

2607 ): 

2608 new_ret = [] 

2609 saved_tags: set[str] = set() 

2610 had_noun = False 

2611 for dt in ret: 

2612 tags = dt["tags"] 

2613 # print(tags) 

2614 if "noun" in tags: 

2615 tags = list( 

2616 sorted(set(t for t in tags if t != "noun") | saved_tags) 

2617 ) 

2618 had_noun = True 

2619 elif ( 2619 ↛ 2646line 2619 didn't jump to line 2646 because the condition on line 2619 was always true

2620 "indefinite" in tags 

2621 or "definite" in tags 

2622 or "usually-without-article" in tags 

2623 or "without-article" in tags 

2624 ): 

2625 if had_noun: 

2626 saved_tags = set(tags) 

2627 else: 

2628 saved_tags = saved_tags | set(tags) # E.g. Haus/German 

2629 remove_useless_tags(lang, pos, saved_tags) 

2630 saved_tags = saved_tags & set( 

2631 [ 

2632 "masculine", 

2633 "feminine", 

2634 "neuter", 

2635 "singular", 

2636 "plural", 

2637 "indefinite", 

2638 "definite", 

2639 "usually-without-article", 

2640 "without-article", 

2641 ] 

2642 ) 

2643 had_noun = False 

2644 continue # Skip the articles 

2645 

2646 dt = dt.copy() 

2647 dt["tags"] = tags 

2648 new_ret.append(dt) 

2649 ret = new_ret 

2650 

2651 elif possibly_ignored_forms: 

2652 # Some languages have tables with cells that are kind of separated 

2653 # and difficult to handle, like eulersche Formel/German where 

2654 # the definite and indefinite articles are just floating. 

2655 # If a language has a dict of conditionally_ignored_cells, 

2656 # and if the contents of a cell is found in one of the rules 

2657 # there, ignore that cell if it 

2658 # 1. Does not have the appropriate tag (like "definite" for "die") 

2659 # and 

2660 # 2. The title of the article is not one of the other co-words 

2661 # (ie. it's an article for the definite articles in german etc.) 

2662 # pass 

2663 new_ret = [] 

2664 for cell_data in ret: 

2665 tags = cell_data["tags"] 

2666 text = cell_data["form"] 

2667 skip_this = False 

2668 for key_tag, ignored_forms in possibly_ignored_forms.items(): 

2669 if text not in ignored_forms: 2669 ↛ 2671line 2669 didn't jump to line 2671 because the condition on line 2669 was always true

2670 continue 

2671 if word in ignored_forms: 

2672 continue 

2673 if key_tag not in tags: 

2674 skip_this = True 

2675 

2676 if skip_this: 2676 ↛ 2677line 2676 didn't jump to line 2677 because the condition on line 2676 was never true

2677 continue 

2678 new_ret.append(cell_data) 

2679 

2680 ret = new_ret 

2681 

2682 # Post-process English inflection tables, adding "multiword-construction" 

2683 # when the number of words has increased. 

2684 if lang == "English" and pos == "verb": 

2685 word_words = len(word.split()) 

2686 new_ret = [] 

2687 for dt in ret: 

2688 form = dt.get("form", "") 

2689 if len(form.split()) > word_words: 

2690 dt = dt.copy() 

2691 dt["tags"] = list(dt.get("tags", [])) 

2692 # This strange copy-assigning shuffle is preventative black 

2693 # magic; do not touch lest you invoke deep bugs. 

2694 data_append(dt, "tags", "multiword-construction") 

2695 new_ret.append(dt) 

2696 ret = new_ret 

2697 

2698 # Always insert "table-tags" detail as the first entry in any inflection 

2699 # table. This way we can reliably detect where a new table starts. 

2700 # Table-tags applies until the next table-tags entry. 

2701 if ret or table_tags: 

2702 table_tags = sorted(set(table_tags)) 

2703 dt = { 

2704 "form": " ".join(table_tags), 

2705 "source": source, 

2706 "tags": ["table-tags"], 

2707 } 

2708 if dt["form"] == "": 

2709 dt["form"] = "no-table-tags" 

2710 if tablecontext.template_name: 

2711 tn: FormData = { 

2712 "form": tablecontext.template_name, 

2713 "source": source, 

2714 "tags": ["inflection-template"], 

2715 } 

2716 ret = [dt] + [tn] + ret 

2717 else: 

2718 ret = [dt] + ret 

2719 

2720 return ret 

2721 

2722 

def find_semantic_parens(form: str, lang: str) -> tuple[str, list[str]]:
    """Strip a bracket pair that wraps the whole form and derive tags from it.

    "Some languages" (=Greek) use brackets to mark things that require
    tags, like (informality), [rarity] and {archaicity}; see
    είμαι/Greek/Verb.  When ``form`` consists solely of one bracketed
    chunk, the brackets are always removed; the tags are only granted when
    every language-configuration flag attached to that bracket style is
    enabled for ``lang``.

    Returns the (possibly unwrapped) form and the list of extra tags.  A
    form that is not entirely wrapped in one of the recognised bracket
    styles is returned unchanged with an empty tag list.
    """
    # (pattern, bracket width, config flags that must all be set, tags)
    bracket_rules = (
        (
            r"\([^][(){}]*\)$",
            1,
            ("parentheses_for_informal",),
            ("informal",),
        ),
        (
            r"\{\[[^][(){}]*\]\}$",
            2,
            ("square_brackets_for_rare", "curly_brackets_for_archaic"),
            ("rare", "archaic"),
        ),
        (
            r"\{[^][(){}]*\}$",
            1,
            ("curly_brackets_for_archaic",),
            ("archaic",),
        ),
        (
            r"\[[^][(){}]*\]$",
            1,
            ("square_brackets_for_rare",),
            ("rare",),
        ),
    )
    for pattern, width, flags, tags in bracket_rules:
        if re.match(pattern, form) is None:
            continue
        # The brackets go away regardless; tags depend on the language.
        if all(get_lang_conf(lang, flag) for flag in flags):
            granted = list(tags)
        else:
            granted = []
        return form[width:-width], granted
    return form, []

2757 

2758 

def handle_mixed_lines(
    alts: list[str], tablecontext: "TableContext"
) -> list[tuple[str, str, str]]:
    """Pair up the alternative lines of a cell as (form, roman, ipa) tuples.

    Handles the special case where romanization is given under the normal
    form, e.g. in Russian.  There can be multiple comma-separated forms in
    each case.  Also handles the case where instead of romanization we have
    IPA pronunciation (e.g., avoir/French/verb).

    ``alts`` is the list of alternative strings found in the cell;
    ``tablecontext`` is only consulted for its ``template_name`` (Georgian
    ``ka-decl-noun`` kludge).  Each returned tuple is
    ``(form, romanization, ipa)`` with ``""`` for the parts that are absent.

    When no form/romanization/IPA layout is recognised, parenthesised
    optional parts inside each alternative are expanded instead, e.g.
    ``"kind(er)"`` -> ``"kind"``, ``"kinder"`` and ``"lampai(tten/den)"``
    -> ``"lampaitten"``, ``"lampaiden"``.
    """

    def looks_like_ipa(text: str) -> bool:
        # Text wrapped in slashes is taken to be IPA.
        return re.match(r"^\s*/.*/\s*$", text) is not None

    len2 = len(alts) // 2

    if len(alts) == 1 and "(" not in alts[0]:
        return [(alts[0], "", "")]

    # Check for IPAs (forms first, IPAs under)
    # base, base, IPA, IPA
    if (
        len(alts) % 2 == 0  # Divisible by two
        and all(looks_like_ipa(x) for x in alts[len2:])
        and not any(looks_like_ipa(x) for x in alts[:len2])
    ):
        # List of tuples: (base, "", ipa)
        return [(alts[i], "", alts[i + len2]) for i in range(len2)]
    # base, base, base, IPA
    if (
        len(alts) > 2
        and looks_like_ipa(alts[-1])
        and all(not x.startswith("/") for x in alts[:-1])
    ):
        # Only if the last alt is IPA
        return [(alts[i], "", alts[-1]) for i in range(len(alts) - 1)]
    # base, IPA, IPA, IPA
    if (
        len(alts) > 2
        and not alts[0].startswith("/")
        and all(looks_like_ipa(x) for x in alts[1:])
    ):
        # First is base and the rest is IPA alternatives
        return [(alts[0], "", x) for x in alts[1:]]

    alt_classifications = [
        classify_desc(
            re.sub(
                # Remove ends of strings starting from ^.  Superscripts
                # have been already removed from the string, while ^xyz
                # needs to be removed separately, though it's usually
                # something with a single letter?
                r"\^.*$",
                "",
                "".join(xx for xx in x if not is_superscript(xx))
                # Remove trailing footnote asterisks that mess with
                # classification
                .strip("* "),
            )
        )
        for x in alts
    ]

    # Check for romanizations, forms first, romanizations under
    if (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(x == "other" for x in alt_classifications[:len2])
        and all(
            x in ("romanization", "english") for x in alt_classifications[len2:]
        )
    ):
        return [(alts[i], alts[i + len2], "") for i in range(len2)]
    # Check for romanizations, forms and romanizations alternating
    if (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            alt_classifications[i] == "other" for i in range(0, len(alts), 2)
        )
        and all(
            alt_classifications[i] in ("romanization", "english")
            for i in range(1, len(alts), 2)
        )
    ):
        return [(alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)]
    # Handle complex Georgian entries with alternative forms and
    # romanizations.  It's a bit of a mess.  Remove this kludge if not
    # needed anymore.  NOTE THAT THE PARENTHESES ON THE WEBSITE ARE NOT
    # DISPLAYED.  They are put inside their own span elements that are
    # then hidden with some CSS.
    # https://en.wiktionary.org/wiki/%E1%83%90%E1%83%9B%E1%83%94%E1%83%A0%E1%83%98%E1%83%99%E1%83%98%E1%83%A1_%E1%83%A8%E1%83%94%E1%83%94%E1%83%A0%E1%83%97%E1%83%94%E1%83%91%E1%83%A3%E1%83%9A%E1%83%98_%E1%83%A8%E1%83%A2%E1%83%90%E1%83%A2%E1%83%94%E1%83%91%E1%83%98
    # ამერიკის შეერთებულ შტატებს(ა) (ameriḳis šeertebul šṭaṭebs(a))
    # The above should generate two alts entries, with two different
    # parallel versions, one without (a) and with (a) at the end,
    # for both the Georgian original and the romanization.
    if (
        tablecontext.template_name == "ka-decl-noun"
        and len(alts) >= 1
        and any(" (" in alt_ for alt_ in alts)
    ):
        return ka_decl_noun_template_cell(alts)
    if (
        len(alts) > 2
        and alt_classifications[0] == "other"
        and all(
            x in ("romanization", "english") for x in alt_classifications[1:]
        )
    ):
        return [(alts[0], x, "") for x in alts[1:]]

    # Fallback: expand parenthesised optional parts inside each alt.
    new_alts = []
    for alt in alts:
        lst = [""]  # partial expansions built so far for this alt
        idx = 0  # position in alt up to which text has been consumed
        for m in re.finditer(
            # start OR letter OR asterisk, followed by (word/word...)
            # group 1 = the preceding character, group 2 = paren contents
            r"(^|\w|\*)\((\w+(/\w+)*)\)",
            alt,
        ):
            v = m.group(2)  # (word/word/word...)
            if (
                classify_desc(v) == "tags"  # Tags inside parens
                or m.group(0) == alt  # All in parens
            ):
                continue
            # Text between the previous expansion and this one, plus the
            # character right before the parenthesis.  This is computed
            # once per match so that *every* partial expansion gets the
            # same intervening text; advancing idx while still looping
            # over the partial expansions would hand all but the first an
            # empty slice and silently drop that text.
            prefix = alt[idx : m.start()] + m.group(1)
            idx = m.end()
            vparts = v.split("/")
            new_lst = []
            for x in lst:
                x += prefix
                if len(vparts) == 1:
                    # "kind(er)" -> ["kind", "kinder"]
                    new_lst.append(x)
                    new_lst.append(x + v)
                else:
                    # "lampai(tten/den)" -> ["lampaitten", "lampaiden"]
                    for vv in vparts:
                        new_lst.append(x + vv)
            lst = new_lst
        for x in lst:
            # add the end of alt
            new_alts.append(x + alt[idx:])
    # [form, no romanization, no ipa]
    return [(x, "", "") for x in new_alts]

2918 

2919 

def handle_generic_table(
    wxr: WiktextractContext,
    tablecontext: "TableContext",
    data: WordData,
    word: str,
    lang: str,
    pos: str,
    rows: list[list[InflCell]],
    titles: list[str],
    source: str,
    after: str,
    depth: int,
) -> None:
    """Parse ``rows`` as an inflection table and append the forms to ``data``.

    The table is handed to parse_simple_table(); every form it returns is
    appended under ``data["forms"]`` unless it duplicates an earlier form
    from the same call.  If the parser returns None a debug message is
    logged and nothing is added.
    """
    # Sanity-check the argument types in the same order as always.
    for value, expected in (
        (wxr, WiktextractContext),
        (data, dict),
        (word, str),
        (lang, str),
        (pos, str),
        (rows, list),
        (source, str),
        (after, str),
        (depth, int),
    ):
        assert isinstance(value, expected)
    for row in rows:
        assert isinstance(row, list)
        for cell in row:
            assert isinstance(cell, InflCell)
    assert isinstance(titles, list)
    for title in titles:
        assert isinstance(title, str)

    # Try to parse the table as a simple table
    parsed = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if parsed is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms but eliminate duplicates.
    seen = set()
    for entry in parsed:
        frozen = freeze(entry)
        if frozen in seen:
            continue  # Don't add duplicates

        entry_tags = entry.get("tags", [])
        # Some Russian words have Declension and Pre-reform declension
        # partially duplicating same data.  Don't add the "dated" variant
        # if the same form without "dated" is already present from the
        # modern declension table.
        if "dated" in entry_tags:
            undated = entry.copy()
            undated_tags = [t for t in entry_tags if t != "dated"]
            undated["tags"] = undated_tags
            if undated_tags and freeze(undated) in seen:
                continue  # Already have it without "dated"

        # "table-tags" entries are never recorded as seen, so they are
        # always appended.
        if "table-tags" not in entry_tags:
            seen.add(frozen)
        data_append(data, "forms", entry)

2985 

2986 

def determine_header(
    wxr: WiktextractContext,
    tablecontext,
    lang: str,
    word: str,
    pos: str,
    table_kind: NodeKind,
    kind: NodeKind | str,
    style: str | None,
    row: list[InflCell],
    col: WikiNode,
    celltext: str,
    titletext: str,
    cols_headered: list[bool],
    target: str | None,
    cellstyle: str,
) -> tuple[bool, list[tuple[str, ...]], str | None, str]:
    """Decide whether a table cell should be treated as a header (title).

    Returns ``(is_title, hdr_expansion, target, celltext)``:

    * ``is_title`` - True when the cell is judged to be a header,
    * ``hdr_expansion`` - what expand_header() produced for the cleaned
      cell text (called with ``silent=True, ignore_tags=True``),
    * ``target`` - the incoming ``target``, or the text after ``": "`` in
      ``titletext`` when the part before it is a key of ``infl_map``,
    * ``celltext`` - the incoming ``celltext``, cut at ``": "`` in that
      same case.
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # Which cell kind counts as a "real" header depends on the table kind.
    header_kind: NodeKind | str
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"

    colon_pos = celltext.find(": ")
    is_title = False

    # remove anything in parentheses, compress whitespace, .strip()
    title_without_parens = re.sub(r"\s*\([^)]*\)", "", titletext)
    title_clean = re.sub(r"\s+", " ", title_without_parens).strip()

    cell_clean, _, _, _ = extract_cell_content(lang, word, celltext)
    cell_clean = re.sub(r"\s+", " ", cell_clean)

    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cell_clean,
        [],
        silent=True,
        ignore_tags=True,
    )

    # KJ: a cell is a header *candidate* when expand_header produced no
    # "error-" tags for it, i.e. the contents are "valid" for a header;
    # these are the false positives we want to catch.
    is_candidate = not any(
        t.startswith("error-") for ts in hdr_expansion for t in ts
    )
    # is_dummy should NOT be used to filter for headers the way
    # is_candidate is, only to filter the related *debug messages*: some
    # dummy-tags are half-way to headers (like ones with "Notes"), so they
    # MUST be headers, but later on they're ignored *as* headers and need
    # no cells-as-headers debug output.
    is_dummy = any(t.startswith("dummy-") for ts in hdr_expansion for t in ts)

    lang_listed = lang in LANGUAGES_WITH_CELLS_AS_HEADERS
    listed_texts = LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")

    if (
        is_candidate
        and kind != header_kind
        and cell_clean not in ("", "dummy-ignored-text-cell")
        and cell_clean not in IGNORED_COLVALUES
    ):
        # A plain (non-header) cell that nevertheless parses as a header.
        if not is_dummy and not lang_listed:
            template, sortid, keep_candidate = (
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}",
                "inflection/2447",
                False,
            )
        elif cell_clean not in listed_texts:
            template, sortid, keep_candidate = (
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}",
                "inflection/2457",
                False,
            )
        else:
            template, sortid, keep_candidate = (
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}",
                "inflection/2466",
                True,
            )
        wxr.wtp.debug(template.format(lang, cell_clean), sortid=sortid)
        if not keep_candidate:
            is_candidate = False

    # If the cell starts with something that could start a definition
    # (typically a reference symbol), make it a candidate regardless of
    # whether the language is listed.
    if re.match(def_re, cell_clean) and not re.match(nondef_re, cell_clean):
        is_candidate = True

    def has_az_span() -> bool:
        # True when the cell directly contains a <span lang="az">.
        return any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )

    if colon_pos >= 0 and titletext[:colon_pos] in infl_map:
        # "<known header>: <target>" packed into a single cell
        target = titletext[colon_pos + 2 :].strip()
        celltext = celltext[:colon_pos]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not has_az_span()
    ):
        is_title = True
    elif (
        is_candidate
        and title_clean not in IGNORED_COLVALUES
        and distw([title_clean], word) > 0.3
        and title_clean not in ("I", "es")
    ):
        is_title = True
    elif (
        # if first column or same style as first column
        style == cellstyle
        # and title is not identical to word name
        and titletext != word
        and cell_clean not in IGNORED_COLVALUES
        and cell_clean != "dummy-ignored-text-cell"
        # the style composite string is not broken
        and not style.startswith("////")
        and " + " not in titletext
    ):
        if not is_dummy and not lang_listed:
            template, sortid, accepted = (
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}",
                "inflection/2512",
                False,
            )
        elif not is_dummy and cell_clean not in listed_texts:
            template, sortid, accepted = (
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}",
                "inflection/2522",
                False,
            )
        else:
            template, sortid, accepted = (
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}",
                "inflection/2530",
                True,
            )
        wxr.wtp.debug(
            template.format(lang, cell_clean, style), sortid=sortid
        )
        if accepted:
            is_title = True

    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True

    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True

    return is_title, hdr_expansion, target, celltext

3184 

3185 

3186class TableContext: 

3187 """Saved context used when parsing a table and its subtables.""" 

3188 

3189 __slot__ = ( 

3190 "stored_hdrspans", 

3191 "section_header", 

3192 "template_name", 

3193 ) 

3194 

3195 def __init__(self, template_name: str | None = None) -> None: 

3196 self.stored_hdrspans: list[HdrSpan] = [] 

3197 self.section_header: tuple[str, ...] = tuple() 

3198 if template_name is None: 

3199 self.template_name = "" 

3200 else: 

3201 self.template_name = template_name 

3202 

3203 

3204def handle_wikitext_or_html_table( 

3205 wxr: WiktextractContext, 

3206 word: str, 

3207 lang: str, 

3208 pos: str, 

3209 data: WordData, 

3210 tree: WikiNode, 

3211 titles: list[str], 

3212 source: str, 

3213 after: str, 

3214 tablecontext: TableContext | None = None, 

3215): 

3216 """Parses a table from parsed Wikitext format into rows and columns of 

3217 InflCell objects and then calls handle_generic_table() to parse it into 

3218 forms. This adds the forms into ``data``.""" 

3219 assert isinstance(wxr, WiktextractContext) 

3220 assert isinstance(word, str) 

3221 assert isinstance(lang, str) 

3222 assert isinstance(pos, str) 

3223 assert isinstance(data, dict) 

3224 assert isinstance(tree, WikiNode) 

3225 assert tree.kind == NodeKind.TABLE or ( 

3226 tree.kind == NodeKind.HTML and tree.sarg == "table" 

3227 ) 

3228 assert isinstance(titles, list) 

3229 assert isinstance(source, str) 

3230 for x in titles: 

3231 assert isinstance(x, str) 

3232 assert isinstance(after, str) 

3233 assert tablecontext is None or isinstance(tablecontext, TableContext) 

3234 # Imported here to avoid a circular import 

3235 from wiktextract.page import clean_node, recursively_extract 

3236 

3237 # from wikitextprocessor.parser import print_tree 

3238 # print_tree(tree) 

3239 # print("-------==========-------") 

3240 

3241 if not tablecontext: 

3242 tablecontext = TableContext() 

3243 

3244 # Get language specific text removal patterns 

3245 remove_text_patterns: tuple[str | re.Pattern, ...] | None = None 

3246 if rem := get_lang_conf(lang, "remove_text_patterns"): 

3247 for poses in rem.keys(): 

3248 if pos in poses: 

3249 remove_text_patterns = rem[poses] 

3250 break 

3251 

3252 def handle_table1( 

3253 wxr: WiktextractContext, 

3254 tablecontext: TableContext, 

3255 word: str, 

3256 lang: str, 

3257 pos: str, 

3258 data: WordData, 

3259 tree: WikiNode, 

3260 titles: list[str], 

3261 source: str, 

3262 after: str, 

3263 depth: int, 

3264 ) -> list[tuple[list[list[InflCell]], list[str], str, int]]: 

3265 # rows, titles, after, depth 

3266 """Helper function allowing the 'flattening' out of the table 

3267 recursion: instead of handling the tables in the wrong order 

3268 (recursively), this function adds to new_row that is then 

3269 iterated through in the main function at the end, creating 

3270 a longer table (still in pieces) in the correct order.""" 

3271 

3272 assert isinstance(data, dict) 

3273 assert isinstance(titles, list) 

3274 assert isinstance(source, str) 

3275 for x in titles: 

3276 assert isinstance(x, str) 

3277 assert isinstance(after, str) 

3278 assert isinstance(depth, int) 

3279 # print("HANDLE_WIKITEXT_TABLE", titles) 

3280 # if len(titles) > 0: 

3281 # wxr.wtp.debug(f"HANDLE_WIKITEXT_TABLE {titles=}") 

3282 

3283 # Filling for columns with rowspan > 1 

3284 col_gap_data: list[InflCell | None] = [] 

3285 # Number of remaining rows for which to fill the column 

3286 vertical_still_left: list[int] = [] 

3287 cols_headered: list[bool] = [] # [F, T, F, F...] 

3288 # True when the whole column contains headers, even 

3289 # when the cell is not considered a header; triggered 

3290 # by the "*" inflmap meta-tag. 

3291 rows: list[list[InflCell]] = [] 

3292 

3293 sub_ret = [] 

3294 

3295 # from wikitextprocessor.parser import print_tree 

3296 # print_tree(tree) 

3297 for node in tree.children: 

3298 if not isinstance(node, WikiNode): 

3299 continue 

3300 kind: NodeKind | str 

3301 if node.kind == NodeKind.HTML: 

3302 kind = node.sarg 

3303 else: 

3304 kind = node.kind 

3305 

3306 # print(" {}".format(node)) 

3307 if kind in (NodeKind.TABLE_CAPTION, "caption"): 

3308 # print(" CAPTION:", node) 

3309 if "inflection-table-title" in node.attrs.get("class", ""): 3309 ↛ 3310line 3309 didn't jump to line 3310 because the condition on line 3309 was never true

3310 titles = [clean_node(wxr, None, node.children)] 

3311 elif kind in (NodeKind.TABLE_ROW, "tr"): 

3312 if "vsShow" in node.attrs.get("class", "").split(): 

3313 # vsShow rows are those that are initially shown in tables 

3314 # that have more data. The hidden data duplicates these 

3315 # rows, so we skip it and just process the hidden data. 

3316 continue 

3317 

3318 # if ( 

3319 # len(node.children) == 1 

3320 # and node.children[0].attrs.get("class") == "separator" 

3321 # ): 

3322 # print("------------------ skip separator") 

3323 # continue 

3324 

3325 # Parse a table row. 

3326 row: list[InflCell] = [] 

3327 style = None 

3328 row_has_nonempty_cells = False 

3329 # Have nonempty cell not from rowspan 

3330 for col in get_table_cells(node): 

3331 # loop through each cell in the ROW 

3332 

3333 # The below skip is not needed anymore, because we "skip" in 

3334 # get_table_cells, but left here as a comment 

3335 # if not isinstance(col, WikiNode): 

3336 # # This skip is not used for counting, 

3337 # # "None" is not used in 

3338 # # indexing or counting or looping. 

3339 # continue 

3340 if col.kind == NodeKind.HTML: 

3341 kind = col.sarg 

3342 else: 

3343 kind = col.kind 

3344 if kind not in ( 3344 ↛ 3350line 3344 didn't jump to line 3350 because the condition on line 3344 was never true

3345 NodeKind.TABLE_HEADER_CELL, 

3346 NodeKind.TABLE_CELL, 

3347 "th", 

3348 "td", 

3349 ): 

3350 print(" UNEXPECTED ROW CONTENT: {}".format(col)) 

3351 continue 

3352 

3353 while ( 

3354 len(row) < len(vertical_still_left) 

3355 and vertical_still_left[len(row)] > 0 

3356 ): 

3357 # vertical_still_left is [...0, 0, 2...] for each 

3358 # column. It is populated at the end of the loop, at the 

3359 # same time as col_gap_data. This needs to be looped and 

3360 # filled this way because each `for col`-looping jumps 

3361 # straight to the next meaningful cell; there is no 

3362 # "None" cells, only emptiness between, and rowspan and 

3363 # colspan are just to generate the "fill- 

3364 vertical_still_left[len(row)] -= 1 

3365 

3366 # KJ Apr 2026 

3367 # type checking is ignored; I am pretty sure that 

3368 # row will never contain None, even if col_gap_data 

3369 # is `InflCell | None`, but this code is such 

3370 # spaghetti that it's hard to figure out, except 

3371 # by the process of elimination: this has never 

3372 # caused trouble before, ergo, it works. 

3373 row.append(col_gap_data[len(row)]) # type: ignore 

3374 

3375 # appending row is how "indexing" is 

3376 # done here; something is appended, 

3377 # like a filler-cell here or a "start" 

3378 # cell at the end of the row-loop, 

3379 # which increased len(row) which is 

3380 # then used as the target-index to check 

3381 # for gaps. vertical_still_left is 

3382 # the countdown to when to stop 

3383 # filling in gaps, and goes down to 0, 

3384 # and col_gap_data is not touched 

3385 # except when a new rowspan is needed, 

3386 # at the same time that 

3387 # vertical_still_left gets reassigned. 

3388 

3389 try: 

3390 rowspan = int(col.attrs.get("rowspan", "1")) # 🡙 

3391 colspan = int(col.attrs.get("colspan", "1")) # 🡘 

3392 except ValueError: 

3393 rowspan = 1 

3394 colspan = 1 

3395 # print("COL:", col) 

3396 

3397 # Too many of these errors 

3398 if colspan > 100: 

3399 # wxr.wtp.error( 

3400 # f"Colspan {colspan} over 30, set to 1", 

3401 # sortid="inflection/20250113a", 

3402 # ) 

3403 colspan = 100 

3404 if rowspan > 100: 3404 ↛ 3409line 3404 didn't jump to line 3409 because the condition on line 3404 was never true

3405 # wxr.wtp.error( 

3406 # f"Rowspan {rowspan} over 30, set to 1", 

3407 # sortid="inflection/20250113b", 

3408 # ) 

3409 rowspan = 100 

3410 

3411 # Process any nested tables recursively. 

3412 tables, rest = recursively_extract( 

3413 col, 

3414 lambda x: ( 

3415 isinstance(x, WikiNode) 

3416 and (x.kind == NodeKind.TABLE or x.sarg == "table") 

3417 ), 

3418 ) 

3419 

3420 # Clean the rest of the cell. 

3421 link_capture_dict: dict = {} 

3422 celltext = clean_node( 

3423 wxr, link_capture_dict, rest, collect_links=True 

3424 ) 

3425 cell_links: list[tuple[str, str]] | None = ( 

3426 link_capture_dict.get("links", None) 

3427 ) 

3428 # print(f"CLEANED: {celltext=}") 

3429 # print(f"SUBTABLES: {tables}") 

3430 # print(f"{link_capture_dict=}") 

3431 

3432 # Remove regexed patterns from text 

3433 if remove_text_patterns is not None: 

3434 for pat in remove_text_patterns: 

3435 celltext = re.sub(pat, "", celltext) 

3436 # print(f"AFTER: {celltext=} <<") 

3437 

3438 # Handle nested tables. 

3439 for tbl in tables: 

3440 # Some nested tables (e.g., croí/Irish) have subtitles 

3441 # as normal paragraphs in the same cell under a descrip- 

3442 # tive text that should be treated as a title (e.g., 

3443 # "Forms with the definite article", with "definite" not 

3444 # mentioned elsewhere). 

3445 new_titles = list(titles) 

3446 if celltext: 

3447 new_titles.append(celltext) 

3448 subtbl = handle_table1( 

3449 wxr, 

3450 tablecontext, 

3451 word, 

3452 lang, 

3453 pos, 

3454 data, 

3455 tbl, # type: ignore 

3456 new_titles, 

3457 source, 

3458 "", 

3459 depth + 1, 

3460 ) 

3461 if subtbl: 3461 ↛ 3439line 3461 didn't jump to line 3439 because the condition on line 3461 was always true

3462 sub_ret.append((rows, titles, after, depth)) 

3463 rows = [] 

3464 titles = [] 

3465 after = "" 

3466 sub_ret.extend(subtbl) 

3467 

3468 # This magic value is used as part of header detection 

3469 cellstyle = ( 

3470 col.attrs.get("style", "") 

3471 + "//" 

3472 + col.attrs.get("class", "") 

3473 + "//" 

3474 + str(kind) 

3475 ) 

3476 

3477 if not row: # if first column in row 

3478 style = cellstyle 

3479 target = None 

3480 titletext = celltext.strip() 

3481 while titletext and is_superscript(titletext[-1]): 

3482 titletext = titletext[:-1] 

3483 

3484 ( 

3485 is_title, 

3486 hdr_expansion, 

3487 target, 

3488 celltext, 

3489 ) = determine_header( 

3490 wxr, 

3491 tablecontext, 

3492 lang, 

3493 word, 

3494 pos, 

3495 tree.kind, 

3496 kind, 

3497 style, 

3498 row, 

3499 col, 

3500 celltext, 

3501 titletext, 

3502 cols_headered, 

3503 None, 

3504 cellstyle, 

3505 ) 

3506 

3507 if is_title: 

3508 # If this cell gets a "*" tag, make the whole column 

3509 # below it (toggling it in cols_headered = [F, F, T...]) 

3510 # into headers. 

3511 while len(cols_headered) <= len(row): 

3512 cols_headered.append(False) 

3513 if any("*" in tt for tt in hdr_expansion): 

3514 cols_headered[len(row)] = True 

3515 celltext = "" 

3516 # if row_has_nonempty_cells has been True at some point, it 

3517 # keeps on being True. 

3518 # if row_has_nonempty_cells or is_title or celltext != "": 

3519 # row_has_nonempty_cells = True 

3520 # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓ 

3521 row_has_nonempty_cells |= is_title or celltext != "" 

3522 cell = InflCell( 

3523 celltext, is_title, colspan, rowspan, target, cell_links 

3524 ) 

3525 for _ in range(0, colspan): 

3526 # colspan🡘 current loop (col) or 1 

3527 # All the data-filling for colspan 

3528 # is done simply in this loop, 

3529 # while rowspan needs to use 

3530 # vertical_still_left to count gaps 

3531 # and col_gap_data to fill in 

3532 # those gaps with InflCell data. 

3533 if rowspan > 1: # rowspan🡙 current loop (col) or 1 

3534 while len(col_gap_data) <= len(row): 

3535 # Initialize col_gap_data/ed if 

3536 # it is lacking slots 

3537 # for each column; col_gap_data and 

3538 # vertical_still_left are never 

3539 # reset to [], during 

3540 # the whole table function. 

3541 col_gap_data.append(None) 

3542 vertical_still_left.append(0) 

3543 # Below is where the "rectangle" block of rowspan 

3544 # and colspan is filled for the future. 

3545 col_gap_data[len(row)] = cell 

3546 # col_gap_data contains cells that 

3547 # will be used in the 

3548 # future, or None 

3549 vertical_still_left[len(row)] = rowspan - 1 

3550 # A counter for how many gaps🡙 are still left to be 

3551 # filled (row.append or 

3552 # row[col_gap_data[len(row)] => 

3553 # rows), it is not reset to [], but decremented to 0 

3554 # each time a row gets something from col_gap_data. 

3555 # Append this cell 1+ times for colspan🡘 

3556 row.append(cell) 

3557 if not row: 

3558 continue 

3559 # After looping the original row-nodes above, fill 

3560 # in the rest of the row if the final cell has colspan 

3561 # (inherited from above, so a cell with rowspan and colspan) 

3562 for i in range(len(row), len(vertical_still_left)): 

3563 if vertical_still_left[i] <= 0: 

3564 continue 

3565 vertical_still_left[i] -= 1 

3566 while len(row) < i: 

3567 row.append(InflCell("", False, 1, 1, None)) 

3568 row.append(col_gap_data[i]) # type: ignore 

3569 # print(" ROW {!r}".format(row)) 

3570 if row_has_nonempty_cells: 3570 ↛ 3297line 3570 didn't jump to line 3297 because the condition on line 3570 was always true

3571 rows.append(row) 

3572 elif kind in ( 3572 ↛ 3297line 3572 didn't jump to line 3297 because the condition on line 3572 was always true

3573 NodeKind.TABLE_HEADER_CELL, 

3574 NodeKind.TABLE_CELL, 

3575 "th", 

3576 "td", 

3577 "span", 

3578 ): 

3579 # print(" TOP-LEVEL CELL", node) 

3580 pass 

3581 

3582 if sub_ret: 

3583 main_ret = sub_ret 

3584 main_ret.append((rows, titles, after, depth)) 

3585 else: 

3586 main_ret = [(rows, titles, after, depth)] 

3587 return main_ret 

3588 

3589 new_rows = handle_table1( 

3590 wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0 

3591 ) 

3592 

3593 # Now we have a table that has been parsed into rows and columns of 

3594 # InflCell objects. Parse the inflection table from that format. 

3595 if new_rows: 3595 ↛ exitline 3595 didn't return from function 'handle_wikitext_or_html_table' because the condition on line 3595 was always true

3596 for rows, titles, after, depth in new_rows: 

3597 handle_generic_table( 

3598 wxr, 

3599 tablecontext, 

3600 data, 

3601 word, 

3602 lang, 

3603 pos, 

3604 rows, 

3605 titles, 

3606 source, 

3607 after, 

3608 depth, 

3609 ) 

3610 

3611 

def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """Yield the cell nodes that are direct children of ``node``.

    Wikitext table cells sometimes contain HTML ``<td>``/``<th>`` elements
    (it is easier to write wikitext conditionals that way), and those
    elements end up parsed as children of the wikitext cell.  For every
    ``WikiNode`` child of ``node`` this generator yields the child itself
    and then, in order, any ``th``/``td`` HTML elements found directly
    among that child's children.

    Side effect: when such HTML cells are found, they are removed from the
    child's ``children`` list (which is replaced by a new list) before the
    child is yielded, so that they are not returned twice.
    """
    for child in node.children:
        # Plain strings and other non-node children are not cells.
        if not isinstance(child, WikiNode):
            continue

        # Split the child's own children into embedded HTML cells and
        # everything else, in a single pass.
        nested_cells = []
        remaining = []
        for item in child.children:
            if isinstance(item, HTMLNode) and item.sarg in ("th", "td"):
                nested_cells.append(item)
            else:
                remaining.append(item)

        # Only touch ``children`` when something was actually extracted.
        if nested_cells:
            child.children = remaining

        yield child
        yield from nested_cells

3640 

3641 

def handle_html_table(
    wxr: WiktextractContext,
    word: str,
    lang: str,
    pos: str,
    data: WordData,
    tree: WikiNode,
    titles: list[str],
    source: str,
    after: str,
    tablecontext: TableContext | None = None,
) -> None:
    """Pass an HTML table on to ``handle_wikitext_or_html_table``.

    All arguments are forwarded unchanged and in the same order; this
    function adds no logic of its own.  XXX: candidate for removal.
    """
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )

3658 

3659 

def handle_wikitext_table(
    wxr: WiktextractContext,
    word: str,
    lang: str,
    pos: str,
    data: WordData,
    tree: WikiNode,
    titles: list[str],
    source: str,
    after: str,
    tablecontext: TableContext | None = None,
) -> None:
    """Pass a wikitext table on to ``handle_wikitext_or_html_table``.

    All arguments are forwarded unchanged and in the same order; this
    function adds no logic of its own.  XXX: candidate for removal.
    """
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )

3676 

3677 

def parse_inflection_section(
    wxr: WiktextractContext,
    data: WordData,
    word: str,
    lang: str,
    pos: str,
    section: str,
    tree: WikiNode,
    tablecontext: TableContext | None = None,
) -> None:
    """Parse an inflection section on a page.

    The parse tree is walked to collect wikitext and HTML tables, together
    with the titles that apply to them and the text that follows them.
    Each collected table is then passed to ``handle_wikitext_table`` or
    ``handle_html_table``.

    Args:
        wxr: The extraction context.
        data: The data for a part-of-speech; inflections will be added
            to it.
        word: The word whose page is being processed.
        lang: The language of the entry.
        pos: The part of speech of the entry.
        section: The section title; it is passed on to the table handlers
            as the ``source`` argument.
        tree: The parsed section; its kind must be ``NodeKind.ROOT``.
        tablecontext: Optional context forwarded to the table handlers.

    Raises:
        RuntimeError: If a collected table has a kind other than
            ``"wikitext"`` or ``"html"``.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Each entry is (table kind, table node, titles, pieces of text that
    # were encountered after the table).
    tables: list[
        tuple[Literal["html", "wikitext"], WikiNode, list[str], list[str]]
    ] = []
    # Text pieces seen inside a NavFrame before its first table; they are
    # joined into the frame's title when its NavContent is reached.
    titleparts: list[str] = []
    # Cleaned text of the most recent ";" list item, used as a title for
    # the top-level nodes that follow it.
    preceding_bolded_title = ""

    def process_tables() -> None:
        """Dispatch every table currently in ``tables`` to its handler."""
        for kind, node, titles, after_l in tables:
            after = "".join(after_l).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node: WikiNode | str, titles: list[str]) -> None:
        """Collect and process the tables inside one NavFrame.

        The frame gets its own, fresh ``tables`` and ``titleparts``; the
        outer ``tables`` list is restored afterwards.  Note that the
        incoming ``titles`` are not forwarded: the frame is walked with an
        empty title list.
        """
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(
        node: WikiNode
        | str
        | list[WikiNode | str]
        | list[list[WikiNode | str]],
        titles: list[str],
        navframe=False,
    ) -> None:
        """Walk ``node`` and record the tables found in it.

        ``navframe`` is true while walking the header part of a NavFrame
        (before its NavContent); tables are only collected when it is
        false.
        """
        nonlocal tables
        nonlocal preceding_bolded_title
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            if tables:
                # Text after a table belongs to that table's "after" text.
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    return
                if "NavHead" in classes:
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # The text gathered so far becomes the title of the
                    # tables inside the content, unless it is a note.
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(("wikitext", node, titles, []))
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                # NOTE(review): when the attribute is a string this is a
                # substring test, unlike the whitespace-split class tests
                # used elsewhere in this function -- confirm intent.
                htmlclasses = node.attrs.get("class", ())
                if "audiotable" in htmlclasses:
                    return
                tables.append(("html", node, titles, []))
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
            # Only checked outside navframe mode: recurse_navframe() walks
            # the NavFrame node itself with navframe=True.
            if (
                kind == NodeKind.HTML
                and node.sarg == "div"
                and "NavFrame" in node.attrs.get("class", "").split()
            ):
                recurse_navframe(node, titles)
                return
        if kind == NodeKind.LINK:
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.HTML and node.sarg == "ref":
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables and section != "Mutation":
        # Explicit UTF-8: words and wikitext routinely contain characters
        # that a locale-dependent default encoding cannot represent.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(word + "\n")
            f.write(lang + "\n")
            f.write(pos + "\n")
            f.write(section + "\n")
            text = wxr.wtp.node_to_wikitext(tree)
            f.write(text + "\n")