Coverage for src/wiktextract/extractor/en/inflection.py: 87%

1513 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1# Code for parsing inflection tables. 

2# 

3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org. 

4 

5import collections 

6import copy 

7import functools 

8import html 

9import itertools 

10import re 

11import unicodedata 

12from typing import Generator, Optional, Union 

13 

14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, freeze, split_at_comma_semi 

18from ...tags import valid_tags 

19from ...wxr_context import WiktextractContext 

20from .form_descriptions import ( 

21 classify_desc, 

22 decode_tags, 

23 distw, 

24 parse_head_final_tags, 

25) 

26from .inflection_kludges import ka_decl_noun_template_cell 

27from .inflectiondata import infl_map, infl_start_map, infl_start_re 

28from .lang_specific_configs import get_lang_conf, lang_specific_tags 

29from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS 

30from .type_utils import FormData 

31 

# --debug-text-cell WORD
# Command-line flag for debugging: while parsing inflection tables, debug
# messages are printed whenever a cell with exactly this text is seen.
debug_cell_text: Optional[str] = None

37 

def set_debug_cell_text(text: str) -> None:
    """Set the cell text that triggers debug printouts during table parsing."""
    global debug_cell_text
    debug_cell_text = text

41 

42 

43TagSets = list[tuple[str, ...]] 

44 

45# Column texts that are interpreted as an empty column. 

46IGNORED_COLVALUES = { 

47 "-", 

48 "־", 

49 "᠆", 

50 "‐", 

51 "‑", 

52 "‒", 

53 "–", 

54 "—", 

55 "―", 

56 "−", 

57 "⸺", 

58 "⸻", 

59 "﹘", 

60 "﹣", 

61 "-", 

62 "/", 

63 "?", 

64 "not used", 

65 "not applicable", 

66} 

67 

# These tags are never inherited from above
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-" + suffix for suffix in ("i", "i-long", "ii", "iii", "iv", "v")
}

78 

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags.  Every entry
# maps a tag to the same tag with an "object-" prefix.
object_concord_replacements = {
    tag: "object-" + tag
    for tag in (
        "first-person",
        "second-person",
        "third-person",
        "singular",
        "plural",
        "definite",
        "indefinite",
        *("class-{}".format(i) for i in range(1, 19)),
        "masculine",
        "feminine",
    )
}

110 

# Words in the table title that cause tags to be added to ALL entries
# extracted from that table.
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: warn about tags that are not in valid_tags.
for title_key, title_tags in title_contains_global_map.items():
    if any(t not in valid_tags for t in title_tags.split()):
        print(
            "TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(
                title_key, title_tags
            )
        )
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map),
    )
)

151 

# Words in the table title that cause tags to be added to the table-tags
# "form" (tags on the word itself, not on every inflected form).
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
}
# Import-time sanity check: warn about tags that are not in valid_tags.
for title_key, title_tags in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in title_tags.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(
                title_key, title_tags
            )
        )
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map),
    )
)

226 

# Parenthesized elements in the table title that are converted to tags on
# the "table-tags" form.
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "Attic": "Attic",  # e.g. καλός/Greek/Adj
    "Epic": "Epic",  # e.g. καλός/Greek/Adj
}
# Import-time sanity check: warn about tags that are not in valid_tags.
for title_key, title_tags in title_elements_map.items():
    if any(t not in valid_tags for t in title_tags.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(title_key, title_tags))

246 

# Prefixes of parenthesized title elements; when an element starts with one
# of these, the rest of the element becomes a form tagged with the mapped tag.
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: warn about tags that are not in valid_tags.
for title_key, title_tags in title_elemstart_map.items():
    if any(t not in valid_tags for t in title_tags.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(title_key, title_tags))
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map))
)

267 

268 

# Regexp for cell starts that are likely definitions of reference symbols
# (e.g. "* rare form", "1) dialectal").  Matches leading bullets, runs of
# asterisks/daggers/digits followed by a closer or space, caret-prefixed
# markers, and single superscript digits or letters.  See also nondef_re.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition (person/number abbreviations and class-number pairs).
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"

286 

# Certain tags are moved from headers in tables into word tags, as they always
# apply to the whole word.  Currently empty: this was originally created for
# an issue with number paradigms in Arabic, but that is handled elsewhere now.
TAGS_FORCED_WORDTAGS: set[str] = set()

295 

296 

class InflCell:
    """Cell in an inflection table.

    Attributes:
        text: stripped cell text
        is_title: True when this is a (non-empty) header cell
        colspan/rowspan: cell span, always >= 1
        target: optional link target associated with the cell
    """

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        self.text = text.strip()
        # BUG FIX: this used to be ``text and is_title``, which stored the
        # empty *string* (not False) in is_title when the cell text was
        # empty; coerce to bool so the flag is always True/False.
        self.is_title = bool(text) and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        v = "{}/{}/{}/{!r}".format(
            self.text, self.is_title, self.colspan, self.rowspan
        )
        if self.target:
            v += ": {!r}".format(self.target)
        return v

    def __repr__(self) -> str:
        return str(self)

337 

338 

class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: list[tuple[str, ...]],
        text: str,
        all_headers_row: bool,
    ) -> None:
        # Validate the arguments before storing anything.
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        assert all(isinstance(ts, tuple) for ts in tagsets)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Normalize each alternative to a sorted, duplicate-free tuple.
        self.tagsets = [tuple(sorted(set(ts))) for ts in tagsets]
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False

379 

380 

def is_superscript(ch: str) -> bool:
    """Returns True if the single character ``ch`` is a superscript character
    (based on its Unicode name)."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        uni_name = unicodedata.name(ch)
    except ValueError:
        # Character has no Unicode name; it cannot be a superscript.
        return False
    return uni_name.startswith(
        ("SUPERSCRIPT ", "MODIFIER LETTER SMALL ", "MODIFIER LETTER CAPITAL ")
    )

397 

398 

def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no
    purpose together (i.e. when together they cover all options)."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Mutually-cancelling pairs, removed when the language config enables it.
    # get_lang_conf is only consulted when both tags are actually present.
    for pair, conf_name in (
        (("animate", "inanimate"), "animate_inanimate_remove"),
        (("virile", "nonvirile"), "virile_nonvirile_remove"),
    ):
        if all(t in tags for t in pair) and get_lang_conf(lang, conf_name):
            for t in pair:
                tags.remove(t)
    # If every value of a whole category for the language (all numbers, all
    # genders, ...) is present, the category conveys no information; drop
    # all of its values.
    for conf_name in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        category_values = get_lang_conf(lang, conf_name)
        if category_values and all(v in tags for v in category_values):
            for v in category_values:
                tags.remove(v)

449 

450 

def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    return {valid_tags[t] for ts in tagset for t in ts}

455 

456 

def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # BUG FIX: this assert previously re-validated tagsets1 (copy-paste
    # error); it is tagsets2 whose elements must be tuples.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Try to merge tags1 into an existing alternative in ``tagsets``;
        # if no alternative is close enough (differs in at most one
        # category), append it as a new alternative.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            if num_differ <= 1:
                # Yes, they can be merged.  Removing tags2 while iterating is
                # safe because we return immediately afterwards.
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        tagsets.append(())
    return tagsets

519 

520 

def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # BUG FIX: this assert previously re-validated tagsets1 (copy-paste
    # error); it is tagsets2 whose elements must be tuples.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            # The dummy marker for ignored text cells must not leak into the
            # merged result.
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    return new_tagsets

547 

548 

@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags).  ``refs`` is a list of reference-symbol
    strings attached to the cell, ``defs`` is a list of (symbol, definition)
    pairs when the cell itself defines reference symbols, and ``tags`` are
    extra tags implied by special reference markers."""
    hdr_tags = []
    # Normalize whitespace and strip trailing comma/bullet.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Cells that are free-form notes rather than forms/headers are ignored.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers of the form ^x or ^(x,y) from the end.
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                # Language-specific marker mapping directly to tags.
                hdr_tags.extend(special_references[r1].split())
            else:
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition (e.g. "* rare form"); if so, return the
    # (symbol, definition) pairs instead of a cleaned header.
    m = def_re.match(col)
    if m and not nondef_re.match(col):
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition: peel superscript
    # characters and daggers off the end one at a time.
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition, e.g. "1) some note".
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    return col.strip(), refs, [], hdr_tags

681 

682 

@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is dictionary describing additional forms to be
    included in the part-of-speech entry)."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Strip HTML entities/tags and normalize whitespace.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    global_tags = []
    table_tags = []
    extra_forms = []
    # Add certain global tags based on contained words; matches of the
    # generic "Inflection/Conjugation/... of X" part are skipped.
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words.
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    # Reflexive French verb tables ("Conjugation of s'.../se ...").
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms

768 

769 

770def expand_header( 

771 wxr: WiktextractContext, 

772 tablecontext: "TableContext", 

773 word: str, 

774 lang: str, 

775 pos: str, 

776 text: str, 

777 base_tags: Union[list[str], set[str], tuple[str, ...]], 

778 silent=False, 

779 ignore_tags=False, 

780 depth=0, 

781) -> list[tuple[str, ...]]: 

782 """Expands a cell header to tagset, handling conditional expressions 

783 in infl_map. This returns list of tuples of tags, each list element 

784 describing an alternative interpretation. ``base_tags`` is combined 

785 column and row tags for the cell in which the text is being interpreted 

786 (conditional expressions in inflection data may depend on it). 

787 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags`` 

788 is True, then tags listed in "if" will be ignored in the test (this is 

789 used when trying to heuristically detect whether a non-<th> cell is anyway 

790 a header).""" 

791 assert isinstance(wxr, WiktextractContext) 

792 assert isinstance(word, str) 

793 assert isinstance(lang, str) 

794 assert isinstance(pos, str) 

795 assert isinstance(text, str) 

796 assert isinstance(base_tags, (list, tuple, set)) 

797 assert silent in (True, False) 

798 assert isinstance(depth, int) 

799 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags)) 

800 # First map the text using the inflection map 

801 text = clean_value(wxr, text) 

802 combined_return: list[tuple[str, ...]] = [] 

803 parts = split_at_comma_semi(text, separators=[";"]) 

804 for text in parts: 

805 if not text: 805 ↛ 806line 805 didn't jump to line 806 because the condition on line 805 was never true

806 continue 

807 if text in infl_map: 

808 v = infl_map[text] # list or string 

809 else: 

810 m = re.match(infl_start_re, text) 

811 if m is not None: 811 ↛ 812line 811 didn't jump to line 812 because the condition on line 811 was never true

812 v = infl_start_map[m.group(1)] 

813 # print("INFL_START {} -> {}".format(text, v)) 

814 elif re.match(r"Notes", text): 

815 # Ignored header 

816 # print("IGNORING NOTES") 

817 combined_return = or_tagsets( 

818 lang, pos, combined_return, [("dummy-skip-this",)] 

819 ) 

820 # this just adds dummy-skip-this 

821 continue 

822 elif text in IGNORED_COLVALUES: 

823 combined_return = or_tagsets( 

824 lang, pos, combined_return, [("dummy-ignore-skipped",)] 

825 ) 

826 continue 

827 # Try without final parenthesized part 

828 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text) 

829 if text_without_parens in infl_map: 

830 v = infl_map[text_without_parens] 

831 elif m is None: 831 ↛ 847line 831 didn't jump to line 847 because the condition on line 831 was always true

832 if not silent: 

833 wxr.wtp.debug( 

834 "inflection table: unrecognized header: {}".format( 

835 repr(text) 

836 ), 

837 sortid="inflection/735", 

838 ) 

839 # Unrecognized header 

840 combined_return = or_tagsets( 

841 lang, pos, combined_return, [("error-unrecognized-form",)] 

842 ) 

843 continue 

844 

845 # Then loop interpreting the value, until the value is a simple string. 

846 # This may evaluate nested conditional expressions. 

847 default_then = None 

848 while True: 

849 # If it is a string, we are done. 

850 if isinstance(v, str): 

851 tags = set(v.split()) 

852 remove_useless_tags(lang, pos, tags) 

853 tagset = [tuple(sorted(tags))] 

854 break 

855 # For a list, just interpret it as alternatives. (Currently the 

856 # alternatives must directly be strings.) 

857 if isinstance(v, (list, tuple)): 

858 tagset = [] 

859 for x in v: 

860 tags = set(x.split()) 

861 remove_useless_tags(lang, pos, tags) 

862 tags_t = tuple(sorted(tags)) 

863 if tags_t not in tagset: 863 ↛ 859line 863 didn't jump to line 859 because the condition on line 863 was always true

864 tagset.append(tags_t) 

865 break 

866 # Otherwise the value should be a dictionary describing a 

867 # conditional expression. 

868 if not isinstance(v, dict): 868 ↛ 869line 868 didn't jump to line 869 because the condition on line 868 was never true

869 wxr.wtp.debug( 

870 "inflection table: internal: " 

871 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]), 

872 sortid="inflection/767", 

873 ) 

874 tagset = [()] 

875 break 

876 # Evaluate the conditional expression. 

877 assert isinstance(v, dict) 

878 cond: Union[bool, str] = "default-true" 

879 c: Union[str, list[str], set[str]] = "" 

880 # Handle "lang" condition. The value must be either a 

881 # single language or a list of languages, and the 

882 # condition evaluates to True if the table is one of 

883 # those languages. 

884 if "lang" in v: 

885 c = v["lang"] 

886 if isinstance(c, str): 

887 cond = c == lang 

888 else: 

889 assert isinstance(c, (list, tuple, set)) 

890 cond = lang in c 

891 # Handle "nested-table-depth" condition. The value must 

892 # be an int or list of ints, and the condition evaluates 

893 # True if the depth is one of those values. 

894 # "depth" is how deep into a nested table tree the current 

895 # table lies. It is first started in handle_wikitext_table, 

896 # so only applies to tables-within-tables, not other 

897 # WikiNode content. `depth` is currently only passed as a 

898 # parameter down the table parsing stack, and not stored. 

899 if cond and "nested-table-depth" in v: 899 ↛ 900line 899 didn't jump to line 900 because the condition on line 899 was never true

900 d = v["nested-table-depth"] 

901 if isinstance(d, int): 

902 cond = d == depth 

903 else: 

904 assert isinstance(d, (list, tuple, set)) 

905 cond = depth in d 

906 # Handle inflection-template condition. Must be a string 

907 # or list of strings, and if tablecontext.template_name is in 

908 # those, accept the condition. 

909 # TableContext.template_name is passed down from page/ 

910 # parse_inflection, before parsing and expanding itself 

911 # has begun. 

912 if cond and tablecontext and "inflection-template" in v: 

913 d1 = v["inflection-template"] 

914 if isinstance(d1, str): 914 ↛ 917line 914 didn't jump to line 917 because the condition on line 914 was always true

915 cond = d1 == tablecontext.template_name 

916 else: 

917 assert isinstance(d1, (list, tuple, set)) 

918 cond = tablecontext.template_name in d1 

919 # Handle "pos" condition. The value must be either a single 

920 # part-of-speech or a list of them, and the condition evaluates to 

921 # True if the part-of-speech is any of those listed. 

922 if cond and "pos" in v: 

923 c = v["pos"] 

924 if isinstance(c, str): 

925 cond = c == pos 

926 else: 

927 assert isinstance(c, (list, tuple, set)) 

928 cond = pos in c 

929 # Handle "if" condition. The value must be a string containing a 

930 # space-separated list of tags. The condition evaluates to True if 

931 # ``base_tags`` contains all of the listed tags. If the condition 

932 # is of the form "any: ...tags...", then any of the tags will be 

933 # enough. 

934 if cond and "if" in v and not ignore_tags: 

935 c = v["if"] 

936 assert isinstance(c, str) 

937 # "if" condition is true if any of the listed tags is present if 

938 # it starts with "any:", otherwise all must be present 

939 if c.startswith("any: "): 

940 cond = any(t in base_tags for t in c[5:].split()) 

941 else: 

942 cond = all(t in base_tags for t in c.split()) 

943 

944 # Handle "default" assignment. Store the value to be used 

945 # as a default later. 

946 if "default" in v: 

947 assert isinstance(v["default"], str) 

948 default_then = v["default"] 

949 

950 # Warning message about missing conditions for debugging. 

951 

952 if cond == "default-true" and not default_then and not silent: 

953 wxr.wtp.debug( 

954 "inflection table: IF MISSING COND: word={} " 

955 "lang={} text={} base_tags={} c={} cond={}".format( 

956 word, lang, text, base_tags, c, cond 

957 ), 

958 sortid="inflection/851", 

959 ) 

960 # Based on the result of evaluating the condition, select either 

961 # "then" part or "else" part. 

962 if cond: 

963 v = v.get("then", "") 

964 else: 

965 v1 = v.get("else") 

966 if v1 is None: 

967 if default_then: 

968 v = default_then 

969 else: 

970 if not silent: 

971 wxr.wtp.debug( 

972 "inflection table: IF WITHOUT ELSE EVALS " 

973 "False: " 

974 "{}/{} {!r} base_tags={}".format( 

975 word, lang, text, base_tags 

976 ), 

977 sortid="inflection/865", 

978 ) 

979 v = "error-unrecognized-form" 

980 else: 

981 v = v1 

982 

983 # Merge the resulting tagset from this header part with the other 

984 # tagsets from the whole header 

985 combined_return = or_tagsets(lang, pos, combined_return, tagset) 

986 

987 # Return the combined tagsets, or empty tagset if we got no tagsets 

988 if not combined_return: 

989 combined_return = [()] 

990 return combined_return 

991 

992 

def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list[HdrSpan],  # was annotated list[str]; elements are HdrSpans
    start: int,
    colspan: int,
    celltext: str,  # was annotated int; asserted str below
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Walks ``hdrspans`` in reverse (bottom-up, i.e. headers closest to the
    data cell first) and merges each overlapping header's tagsets into the
    result.  Headers within one row are combined with ``and_tagsets``;
    when the walk moves to a higher row, the accumulated row tagsets are
    folded into the column tagsets.  Numerous language-configurable
    stop/skip heuristics (via ``get_lang_conf``) decide when tags from
    higher rows must not "bleed" into the cell.

    Args:
        lang: language name of the table.
        pos: part-of-speech of the word the table belongs to.
        hdrspans: header spans collected so far for this table.
        start: 0-based index of the cell's first column.
        colspan: number of columns the cell spans (>= 1).
        celltext: the cell's text; used only for debug printing when it
            equals ``debug_cell_text``.

    Returns:
        A list of tag tuples (alternative tagsets) applying to the column.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    #       .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                "  row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    used = set()
    coltags = [()]
    last_header_row = 1000000
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000
    used_hdrspans = set()
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets.  This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row.  Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside

                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above.  However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 10000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags

1373 

1374 

1375def parse_simple_table( 

1376 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth 

1377): 

1378 """This is the default table parser. Despite its name, it can parse 

1379 complex tables. This returns a list of forms to be added to the 

1380 part-of-speech, or None if the table could not be parsed.""" 

1381 assert isinstance(wxr, WiktextractContext) 

1382 assert isinstance(tablecontext, TableContext) 

1383 assert isinstance(word, str) 

1384 assert isinstance(lang, str) 

1385 assert isinstance(pos, str) 

1386 assert isinstance(rows, list) 

1387 assert isinstance(source, str) 

1388 assert isinstance(after, str) 

1389 assert isinstance(depth, int) 

1390 for row in rows: 

1391 for col in row: 

1392 assert isinstance(col, InflCell) 

1393 assert isinstance(titles, list) 

1394 for x in titles: 

1395 assert isinstance(x, str) 

1396 

1397 # print("PARSE_SIMPLE_TABLE: TITLES:", titles) 

1398 if debug_cell_text: 1398 ↛ 1399line 1398 didn't jump to line 1399 because the condition on line 1398 was never true

1399 print("ROWS:") 

1400 for row in rows: 

1401 print(" ", row) 

1402 

1403 # Check for forced rowspan kludge. See e.g. 

1404 # maorski/Serbo-Croatian. These are essentially multi-row 

1405 # cells implemented using <br> rather than separate cell. We fix this 

1406 # by identifying rows where this happens, and splitting the current row 

1407 # to multiple rows by synthesizing additional cells. 

1408 new_rows = [] 

1409 for row in rows: 

1410 split_row = ( 

1411 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row) 

1412 and 

1413 # x is an InflCell 

1414 all(x.rowspan == 1 for x in row) 

1415 ) 

1416 if not split_row: 

1417 new_rows.append(row) 

1418 continue 

1419 row1 = [] 

1420 row2 = [] 

1421 for cell in row: 

1422 cell1 = copy.deepcopy(cell) 

1423 if "\n" in cell.text: 

1424 # Has more than one line - split this cell 

1425 parts = cell.text.strip().splitlines() 

1426 if len(parts) != 2: 1426 ↛ 1427line 1426 didn't jump to line 1427 because the condition on line 1426 was never true

1427 wxr.wtp.debug( 

1428 "forced rowspan kludge got {} parts: {!r}".format( 

1429 len(parts), cell.text 

1430 ), 

1431 sortid="inflection/1234", 

1432 ) 

1433 cell2 = copy.deepcopy(cell) 

1434 cell1.text = parts[0] 

1435 cell2.text = parts[1] 

1436 else: 

1437 cell1.rowspan = 2 

1438 cell2 = cell1 # ref, not a copy 

1439 row1.append(cell1) 

1440 row2.append(cell2) 

1441 new_rows.append(row1) 

1442 new_rows.append(row2) 

1443 rows = new_rows 

1444 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:") 

1445 # for row in rows: 

1446 # print(" ", row) 

1447 

1448 # Parse definitions for references (from table itself and from text 

1449 # after it) 

1450 def_ht = {} 

1451 

1452 def add_defs(defs: list[tuple[str, str]]) -> None: 

1453 for ref, d in defs: 

1454 # print("DEF: ref={} d={}".format(ref, d)) 

1455 d = d.strip() 

1456 d = d.split(". ")[0].strip() # text before ". " 

1457 if not d: 1457 ↛ 1458line 1457 didn't jump to line 1458 because the condition on line 1457 was never true

1458 continue 

1459 if d.endswith("."): # catc ".."?? 

1460 d = d[:-1] 

1461 tags, topics = decode_tags(d, no_unknown_starts=True) 

1462 # print(f"{ref=}, {d=}, {tags=}") 

1463 if topics or any("error-unknown-tag" in ts for ts in tags): 

1464 d = d[0].lower() + d[1:] 

1465 tags, topics = decode_tags(d, no_unknown_starts=True) 

1466 if topics or any("error-unknown-tag" in ts for ts in tags): 

1467 # Failed to parse as tags 

1468 # print("Failed: topics={} tags={}" 

1469 # .format(topics, tags)) 

1470 continue 

1471 tags1_s: set[str] = set() 

1472 for ts in tags: 

1473 tags1_s.update(ts) 

1474 tags1 = tuple(sorted(tags1_s)) 

1475 # print("DEFINED: {} -> {}".format(ref, tags1)) 

1476 def_ht[ref] = tags1 

1477 

    def generate_tags(
        rowtags: list[tuple[str, ...]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        """Expand the current header cell into tag alternatives.

        For every combination of row tags (``rowtags``) and column tags
        computed from the header spans above, expand the header ``text``
        via ``expand_header`` and collect the results.

        Uses closure variables from parse_simple_table: lang, pos,
        hdrspans, col_idx, colspan, col, text, global_tags, refs_tags,
        hdr_tags, depth, wxr, tablecontext, word.

        NOTE: ``table_tags`` is mutated in place (tags in
        TAGS_FORCED_WORDTAGS are promoted to word-level tags).

        Returns a tuple (new_rowtags, new_coltags, all_hdr_tags).
        """
        new_coltags = []
        all_hdr_tags = []  # list of tuples
        new_rowtags = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                base_tags: set[str] = (
                    set(rt0)
                    | set(ct0)
                    | set(global_tags)
                    | set(itertools.chain.from_iterable(table_tags))
                )  # Union.
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Certain tags are always moved to word-level tags
                    if tt_s & TAGS_FORCED_WORDTAGS:
                        table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS)
                        tt_s = tt_s - TAGS_FORCED_WORDTAGS
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags

1538 

    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        """Create a HdrSpan for the current header cell and append it to
        ``hdrspans``, updating the left-most-header (col0) expansion state.

        Uses closure variables from parse_simple_table: col_idx, colspan,
        rowspan, rownum, new_coltags, all_headers, all_hdr_tags,
        previously_seen, tablecontext, lang.

        Returns (col, col0_followed_by_nonempty, col0_hdrspan), the
        updated state the caller continues with.
        """
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags).  In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            # No left-side header yet on this row; this becomes it.
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # print("col0_cats={} later_cats={} "
            #       "fol_by_nonempty={} col_idx={} end={} "
            #       "tagsets={}"
            #       .format(col0_cats, later_cats,
            #               col0_followed_by_nonempty, col_idx,
            #               col0_hdrspan.start +
            #               col0_hdrspan.colspan,
            #               col0_hdrspan.tagsets))
            # print("col0.rowspan={} rowspan={}"
            #       .format(col0_hdrspan.rowspan, rowspan))
            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            #   - col0_hdrspan set, continue with allowed current
            #   - col0_hdrspan set, expand, start new
            #   - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan.  Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan
                # *Expand* current col0_hdrspan
                # print("EXPANDING COL0 MID: {} from {} to {} "
                #       "cols {}"
                #       .format(col0_hdrspan.text,
                #               col0_hdrspan.colspan,
                #               col_idx - col0_hdrspan.start,
                #               col0_hdrspan.tagsets))
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan

1636 

1637 def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]: 

1638 # Split the cell text into alternatives 

1639 split_extra_tags = [] 

1640 if col and is_superscript(col[0]): 1640 ↛ 1641line 1640 didn't jump to line 1641 because the condition on line 1640 was never true

1641 alts = [col] 

1642 else: 

1643 separators = [";", "•", r"\n", " or "] 

1644 if " + " not in col: 

1645 separators.append(",") 

1646 if not col.endswith("/"): 

1647 separators.append("/") 

1648 if col in special_phrase_splits: 

1649 # Use language-specific special splits. 

1650 # These are phrases and constructions that have 

1651 # unique ways of splitting, not specific characters 

1652 # to split on like with the default splitting. 

1653 alts, tags = special_phrase_splits[col] 

1654 split_extra_tags = tags.split() 

1655 for x in split_extra_tags: 

1656 assert x in valid_tags 

1657 assert isinstance(alts, (list, tuple)) 

1658 assert isinstance(tags, str) 

1659 else: 

1660 # Use default splitting. However, recognize 

1661 # language-specific replacements and change them to magic 

1662 # characters before splitting. This way we won't split 

1663 # them. This is important for, e.g., recognizing 

1664 # alternative pronouns. 

1665 # The magic characters are characters out of Unicode scope 

1666 # that are given a simple incremental value, int > unicode. 

1667 repls = {} 

1668 magic_ch = MAGIC_FIRST 

1669 trs = get_lang_conf(lang, "form_transformations") 

1670 # trs is a list of lists of strings 

1671 for _, v, _, _ in trs: 

1672 # v is a pattern string, like "^ich" 

1673 # form_transformations data is doing double-duty here, 

1674 # because the pattern strings are already known to us and 

1675 # not meant to be split. 

1676 m = re.search(v, col) 

1677 if m is not None: 

1678 # if pattern found in text 

1679 magic = chr(magic_ch) 

1680 magic_ch += 1 # next magic character value 

1681 col = re.sub(v, magic, col) # replace with magic ch 

1682 repls[magic] = m.group(0) 

1683 # remember what regex match string each magic char 

1684 # replaces. .group(0) is the whole match. 

1685 alts0 = split_at_comma_semi(col, separators=separators) 

1686 # with magic characters in place, split the text so that 

1687 # pre-transformation text is out of the way. 

1688 alts = [] 

1689 for alt in alts0: 

1690 # create a new list with the separated items and 

1691 # the magic characters replaced with the original texts. 

1692 for k, v in repls.items(): 

1693 alt = re.sub(k, v, alt) 

1694 alts.append(alt) 

1695 

1696 # Remove "*" from beginning of forms, as in non-attested 

1697 # or reconstructed forms. Otherwise it might confuse romanization 

1698 # detection. 

1699 alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts) 

1700 alts = list( 

1701 x for x in alts if not re.match(r"pronounced with |\(with ", x) 

1702 ) 

1703 alts = list( 

1704 re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts 

1705 ) 

1706 return col, alts, split_extra_tags 

1707 

    def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
        """Normalize a list of alternatives into (form, roman, ipa) triples.

        Handles the special case where romanization is given under the
        normal form, e.g. in Russian.  There can be multiple
        comma-separated forms in each case.  Also handles the case
        where instead of romanization we have IPA pronunciation
        (e.g., avoir/French/verb).  Each returned tuple is
        (base form, romanization or "", IPA or "").
        """
        len2 = len(alts) // 2
        # Check for IPAs (forms first, IPAs under)
        # base, base, IPA, IPA
        if (
            len(alts) % 2 == 0  # Divisible by two
            and all(
                re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
                for x in alts[len2:]
            )
        ):  # In the second half of alts
            nalts = list(
                (alts[i], "", alts[i + len2])
                # List of tuples: (base, "", ipa)
                for i in range(len2)
            )
        # base, base, base, IPA
        elif (
            len(alts) > 2
            and re.match(r"^\s*/.*/\s*$", alts[-1])
            and all(not x.startswith("/") for x in alts[:-1])
        ):
            # Only if the last alt is IPA: one shared IPA for all bases
            nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
        # base, IPA, IPA, IPA
        elif (
            len(alts) > 2
            and not alts[0].startswith("/")
            and all(
                re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
            )
        ):
            # First is base and the rest is IPA alternatives
            nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

        # Check for romanizations, forms first, romanizations under
        elif (
            len(alts) % 2 == 0
            and not any("(" in x for x in alts)
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        # Remove ends of strings starting from ^.
                        # Superscripts have been already removed
                        # from the string, while ^xyz needs to be
                        # removed separately, though it's usually
                        # something with a single letter?
                        "".join(xx for xx in x if not is_superscript(xx)),
                    )
                )
                == "other"
                for x in alts[:len2]
            )
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in x if not is_superscript(xx)),
                    )
                )
                in ("romanization", "english")
                for x in alts[len2:]
            )
        ):
            nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
        # Check for romanizations, forms and romanizations alternating
        elif (
            len(alts) % 2 == 0
            and not any("(" in x for x in alts)
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in alts[i] if not is_superscript(xx)),
                    )
                )
                == "other"
                for i in range(0, len(alts), 2)
            )
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in alts[i] if not is_superscript(xx)),
                    )
                )
                in ("romanization", "english")
                for i in range(1, len(alts), 2)
            )
        ):
            # odds
            nalts = list(
                (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
            )
            # evens
        # Handle complex Georgian entries with alternative forms and*
        # *romanizations. It's a bit of a mess. Remove this kludge if not
        # needed anymore. NOTE THAT THE PARENTHESES ON THE WEBSITE ARE NOT
        # DISPLAYED. They are put inside their own span elements that are
        # then hidden with some CSS.
        # https://en.wiktionary.org/wiki/%E1%83%90%E1%83%9B%E1%83%94%E1%83%A0%E1%83%98%E1%83%99%E1%83%98%E1%83%A1_%E1%83%A8%E1%83%94%E1%83%94%E1%83%A0%E1%83%97%E1%83%94%E1%83%91%E1%83%A3%E1%83%9A%E1%83%98_%E1%83%A8%E1%83%A2%E1%83%90%E1%83%A2%E1%83%94%E1%83%91%E1%83%98
        # ამერიკის შეერთებულ შტატებს(ა) (ameriḳis šeertebul šṭaṭebs(a))
        # The above should generate two alts entries, with two different
        # parallel versions, one without (a) and with (a) at the end,
        # for both the Georgian original and the romanization.
        elif (
            tablecontext.template_name == "ka-decl-noun"
            and len(alts) == 1
            and " (" in alts[0]
        ):
            nalts = ka_decl_noun_template_cell(alts)
        else:
            # Default: expand parenthesized optional/alternative parts,
            # e.g. "kind(er)" -> ["kind", "kinder"].
            new_alts = []
            for alt in alts:
                lst = [""]
                idx = 0
                for m in re.finditer(
                    r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                    # start OR letter OR asterisk (word/word*)
                    # \\___________group 1_______/ \ \_g3_///
                    #                               \ \__gr. 2_//
                    #  \_____________group 0________________/
                    alt,
                ):
                    v = m.group(2)  # (word/word/word...)
                    if (
                        classify_desc(v) == "tags"  # Tags inside parens
                        or m.group(0) == alt
                    ):  # All in parens
                        continue
                    new_lst = []
                    for x in lst:
                        x += alt[idx : m.start()] + m.group(1)
                        # alt until letter or asterisk
                        idx = m.end()
                        vparts = v.split("/")
                        # group(2) = ["word", "wörd"...]
                        if len(vparts) == 1:
                            new_lst.append(x)
                            new_lst.append(x + v)
                            # "kind(er)" -> ["kind", "kinder"]
                        else:
                            for vv in vparts:
                                new_lst.append(x + vv)
                                # "lampai(tten/den)" ->
                                # ["lampaitten", "lampaiden"]
                    lst = new_lst
                for x in lst:
                    new_alts.append(x + alt[idx:])
                    # add the end of alt
            nalts = list((x, "", "") for x in new_alts)
            # [form, no romz, no ipa]
        return nalts

1871 

1872 def find_semantic_parens(form: str) -> tuple[str, list[str]]: 

1873 # "Some languages" (=Greek) use brackets to mark things that 

1874 # require tags, like (informality), [rarity] and {archaicity}. 

1875 extra_tags = [] 

1876 if re.match(r"\([^][(){}]*\)$", form): 

1877 if get_lang_conf(lang, "parentheses_for_informal"): 

1878 form = form[1:-1] 

1879 extra_tags.append("informal") 

1880 else: 

1881 form = form[1:-1] 

1882 elif re.match(r"\{\[[^][(){}]*\]\}$", form): 

1883 if get_lang_conf( 1883 ↛ 1890line 1883 didn't jump to line 1890 because the condition on line 1883 was always true

1884 lang, "square_brackets_for_rare" 

1885 ) and get_lang_conf(lang, "curly_brackets_for_archaic"): 

1886 # είμαι/Greek/Verb 

1887 form = form[2:-2] 

1888 extra_tags.extend(["rare", "archaic"]) 

1889 else: 

1890 form = form[2:-2] 

1891 elif re.match(r"\{[^][(){}]*\}$", form): 

1892 if get_lang_conf(lang, "curly_brackets_for_archaic"): 1892 ↛ 1897line 1892 didn't jump to line 1897 because the condition on line 1892 was always true

1893 # είμαι/Greek/Verb 

1894 form = form[1:-1] 

1895 extra_tags.extend(["archaic"]) 

1896 else: 

1897 form = form[1:-1] 

1898 elif re.match(r"\[[^][(){}]*\]$", form): 

1899 if get_lang_conf(lang, "square_brackets_for_rare"): 1899 ↛ 1904line 1899 didn't jump to line 1904 because the condition on line 1899 was always true

1900 # είμαι/Greek/Verb 

1901 form = form[1:-1] 

1902 extra_tags.append("rare") 

1903 else: 

1904 form = form[1:-1] 

1905 return form, extra_tags 

1906 

    def handle_parens(
        form: str, roman: str, clitic: str, extra_tags: list[str]
    ) -> tuple[str, str, str]:
        """Interpret one parenthesized fragment found inside a form.

        NOTE(review): this closure reads ``paren`` (the text inside the
        parentheses), ``m`` (the regex match covering the whole
        parenthesized span in ``form``) and ``subst`` (replacement
        text) from the enclosing loop, which is outside this view —
        confirm their exact definitions there.  May mutate
        ``extra_tags`` in place; returns updated (form, roman, clitic).
        """
        if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
            # is there a clitic starting with apostrophe?
            clitic = paren
            # assume the whole paren is a clitic
            # then remove paren from form
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        elif classify_desc(paren) == "tags":
            # Parenthesized content decodes as grammatical tags
            tagsets1, topics1 = decode_tags(paren)
            if not topics1:
                for ts in tagsets1:
                    ts = tuple(x for x in ts if " " not in x)
                    # There are some generated tags containing
                    # spaces; do not let them through here.
                    extra_tags.extend(ts)
                form = (form[: m.start()] + subst + form[m.end() :]).strip()
        # brackets contain romanization
        elif (
            m.start() > 0
            and not roman
            and classify_desc(form[: m.start()]) == "other"
            and
            # "other" ~ text
            classify_desc(paren) in ("romanization", "english")
            and not re.search(r"^with |-form$", paren)
        ):
            roman = paren
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        elif re.search(r"^with |-form", paren):
            # Descriptive notes like "with ..." / "...-form": just drop
            # the parenthesized part from the form.
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        return form, roman, clitic

1940 

    def merge_row_and_column_tags(form, some_has_covered_text):
        """Combine row tags and column tags into form entries.

        For each (rowtag-set, coltag-set) pair, builds a tag set from
        global/extra/ref/section tags plus the row and column tags,
        applies language- and POS-specific cleanups, and appends a
        form dict (with "form", "tags", "source" and optionally
        "roman"/"ipa") to the result.  Reads many enclosing-scope
        variables (rowtags, coltags, global_tags, extra_tags,
        refs_tags, tablecontext, col_idx, has_covering_hdr, roman,
        ipa, clitic, source).  Returns (list of form dicts, possibly
        adjusted form, updated some_has_covered_text flag).

        We give preference to moods etc coming from rowtags
        (cf. austteigen/German/Verb imperative forms).

        In certain cases, what a tag means depends on whether
        it is a row or column header.  Depending on the language,
        we replace certain tags with others if they're in
        a column or row.
        """
        ret = []
        # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
        # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
        for rt in sorted(rowtags):
            if "dummy-use-as-coltags" in rt:
                continue
            # if lang was in rowtag_replacements)
            # if not rtagreplacs == None:
            #     rt = replace_directional_tags(rt, rtagreplacs)
            for ct in sorted(coltags):
                if "dummy-use-as-rowtags" in ct:
                    continue
                # if lang was in coltag_replacements
                # if not ctagreplacs == None:
                #     ct = replace_directional_tags(ct,
                #                                   ctagreplacs)
                tags = set(global_tags)
                tags.update(extra_tags)
                tags.update(rt)
                tags.update(refs_tags)
                tags.update(tablecontext.section_header)
                # Merge tags from column. For certain kinds of tags,
                # those coming from row take precedence.
                old_tags = set(tags)
                for t in ct:
                    c = valid_tags[t]
                    if c in ("mood", "case", "number") and any(
                        valid_tags[tt] == c for tt in old_tags
                    ):
                        continue
                    tags.add(t)

                # Extract language-specific tags from the
                # form. This may also adjust the form.
                form, lang_tags = lang_specific_tags(lang, pos, form)
                tags.update(lang_tags)

                # For non-finite verb forms, see if they have
                # a gender/class suffix
                if pos == "verb" and any(
                    valid_tags[t] == "non-finite" for t in tags
                ):
                    form, tt = parse_head_final_tags(wxr, lang, form)
                    tags.update(tt)

                # Remove "personal" tag if have nth person; these
                # come up with e.g. reconhecer/Portuguese/Verb. But
                # not if we also have "pronoun"
                if (
                    "personal" in tags
                    and "pronoun" not in tags
                    and any(
                        x in tags
                        for x in [
                            "first-person",
                            "second-person",
                            "third-person",
                        ]
                    )
                ):
                    tags.remove("personal")

                # If we have impersonal, remove person and number.
                # This happens with e.g. viajar/Portuguese/Verb
                if "impersonal" in tags:
                    tags = tags - set(
                        [
                            "first-person",
                            "second-person",
                            "third-person",
                            "singular",
                            "plural",
                        ]
                    )

                # Remove unnecessary "positive" tag from verb forms
                if pos == "verb" and "positive" in tags:
                    if "negative" in tags:
                        tags.remove("negative")
                    tags.remove("positive")

                # Many Russian (and other Slavic) inflection tables
                # have animate/inanimate distinction that generates
                # separate entries for neuter/feminine, but the
                # distinction only applies to masculine. Remove them
                # from neuter/feminine and eliminate duplicates.
                if get_lang_conf(lang, "masc_only_animate"):
                    for t1 in ("animate", "inanimate"):
                        for t2 in ("neuter", "feminine"):
                            if (
                                t1 in tags
                                and t2 in tags
                                and "masculine" not in tags
                                and "plural" not in tags
                            ):
                                tags.remove(t1)

                # German adjective tables contain "(keiner)" etc
                # for mixed declension plural. When the adjective
                # disappears and it becomes just one word, remove
                # the "includes-article" tag. e.g. eiskalt/German
                if "includes-article" in tags and " " not in form:
                    tags.remove("includes-article")

                # Handle ignored forms. We mark that the form was
                # provided. This is important information; some words
                # just do not have a certain form. However, there also
                # many cases where no word in a language has a
                # particular form. Post-processing could detect and
                # remove such cases.
                if form in IGNORED_COLVALUES:
                    # if cell text seems to be ignorable
                    if "dummy-ignore-skipped" in tags:
                        continue
                    if (
                        col_idx not in has_covering_hdr
                        and some_has_covered_text
                    ):
                        continue
                    # don't ignore this cell if there's been a header
                    # above it
                    form = "-"
                elif col_idx in has_covering_hdr:
                    some_has_covered_text = True

                # Handle ambiguous object concord. If a header
                # gives the "dummy-object-concord"-tag to a word,
                # replace person, number and gender tags with
                # their "object-" counterparts so that the verb
                # agrees with the object instead.
                # Use only when the verb has ONLY object agreement!
                # a پخول/Pashto
                if "dummy-object-concord" in tags:
                    for subtag, objtag in object_concord_replacements.items():
                        if subtag in tags:
                            tags.remove(subtag)
                            tags.add(objtag)

                # Remove the dummy mood tag that we sometimes
                # use to block adding other mood and related
                # tags
                tags = tags - set(
                    [
                        "dummy-mood",
                        "dummy-tense",
                        "dummy-ignore-skipped",
                        "dummy-object-concord",
                        "dummy-reset-headers",
                        "dummy-use-as-coltags",
                        "dummy-use-as-rowtags",
                        "dummy-store-hdrspan",
                        "dummy-load-stored-hdrspans",
                        "dummy-reset-stored-hdrspans",
                        "dummy-section-header",
                    ]
                )

                # Perform language-specific tag replacements according
                # to rules in a table.
                lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
                if lang_tag_mappings is not None:
                    for pre, post in lang_tag_mappings.items():
                        if all(t in tags for t in pre):
                            tags = (tags - set(pre)) | set(post)

                # Warn if there are entries with empty tags
                if not tags:
                    wxr.wtp.debug(
                        "inflection table: empty tags for {}".format(form),
                        sortid="inflection/1826",
                    )

                # Warn if form looks like IPA
                ########## XXX ########
                # Because IPA is its own unicode block, we could also
                # technically do a Unicode name check to see if a string
                # contains IPA. Not all valid IPA characters are in the
                # IPA extension block, so you can technically have false
                # negatives if it's something like /toki/, but it
                # shouldn't give false positives.
                # Alternatively, you could make a list of IPA-admissible
                # characters and reject non-IPA stuff with that.
                if re.match(r"\s*/.*/\s*$", form):
                    wxr.wtp.debug(
                        "inflection table form looks like IPA: "
                        "form={} tags={}".format(form, tags),
                        sortid="inflection/1840",
                    )

                # Note that this checks `form`, not `in tags`
                if form == "dummy-ignored-text-cell":
                    continue

                if "dummy-remove-this-cell" in tags:
                    continue

                # Add the form
                tags = list(sorted(tags))
                dt = {"form": form, "tags": tags, "source": source}
                if roman:
                    dt["roman"] = roman
                if ipa:
                    dt["ipa"] = ipa
                ret.append(dt)
                # If we got separate clitic form, add it
                if clitic:
                    dt = {
                        "form": clitic,
                        "tags": tags + ["clitic"],
                        "source": source,
                    }
                    ret.append(dt)
        return ret, form, some_has_covered_text

2164 

2165 # First extract definitions from cells 

2166 # See defs_ht for footnote defs stuff 

2167 for row in rows: 

2168 for cell in row: 

2169 text, refs, defs, hdr_tags = extract_cell_content( 

2170 lang, word, cell.text 

2171 ) 

2172 # refs, defs = footnote stuff, defs -> (ref, def) 

2173 add_defs(defs) 

2174 # Extract definitions from text after table 

2175 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after) 

2176 add_defs(defs) 

2177 

2178 # Then extract the actual forms 

2179 ret = [] 

2180 hdrspans = [] 

2181 first_col_has_text = False 

2182 rownum = 0 

2183 title = None 

2184 global_tags = [] 

2185 table_tags = [] 

2186 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits") 

2187 form_replacements = get_lang_conf(lang, "form_replacements") 

2188 form_transformations = get_lang_conf(lang, "form_transformations") 

2189 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells") 

2190 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups") 

2191 

2192 for title in titles: 

2193 more_global_tags, more_table_tags, extra_forms = parse_title( 

2194 title, source 

2195 ) 

2196 global_tags.extend(more_global_tags) 

2197 table_tags.extend(more_table_tags) 

2198 ret.extend(extra_forms) 

2199 cell_rowcnt = collections.defaultdict(int) 

2200 seen_cells = set() 

2201 has_covering_hdr = set() 

2202 some_has_covered_text = False 

2203 for row in rows: 

2204 # print("ROW:", row) 

2205 # print("====") 

2206 # print(f"Start of PREVIOUS row hdrspans:" 

2207 # f"{tuple(sp.tagsets for sp in hdrspans)}") 

2208 # print(f"Start of row txt: {tuple(t.text for t in row)}") 

2209 if not row: 2209 ↛ 2210line 2209 didn't jump to line 2210 because the condition on line 2209 was never true

2210 continue # Skip empty rows 

2211 all_headers = all(x.is_title or not x.text.strip() for x in row) 

2212 text = row[0].text 

2213 if ( 

2214 row[0].is_title 

2215 and text 

2216 and not is_superscript(text[0]) 

2217 and text not in infl_map # zealous inflation map? 

2218 and ( 

2219 re.match(r"Inflection ", text) 

2220 or re.sub( 

2221 r"\s+", 

2222 " ", # flatten whitespace 

2223 re.sub( 

2224 r"\s*\([^)]*\)", 

2225 "", 

2226 # Remove whitespace+parens 

2227 text, 

2228 ), 

2229 ).strip() 

2230 not in infl_map 

2231 ) 

2232 and not re.match(infl_start_re, text) 

2233 and all( 

2234 x.is_title == row[0].is_title and x.text == text 

2235 # all InflCells in `row` have the same is_title and text 

2236 for x in row 

2237 ) 

2238 ): 

2239 if text and title is None: 

2240 # Only if there were no titles previously make the first 

2241 # text that is found the title 

2242 title = text 

2243 if re.match(r"(Note:|Notes:)", title): 2243 ↛ 2244line 2243 didn't jump to line 2244 because the condition on line 2243 was never true

2244 continue # not a title 

2245 more_global_tags, more_table_tags, extra_forms = parse_title( 

2246 title, source 

2247 ) 

2248 global_tags.extend(more_global_tags) 

2249 table_tags.extend(more_table_tags) 

2250 ret.extend(extra_forms) 

2251 continue # Skip title rows without incrementing i 

2252 if "dummy-skip-this" in global_tags: 2252 ↛ 2253line 2252 didn't jump to line 2253 because the condition on line 2252 was never true

2253 return [] 

2254 rowtags = [()] 

2255 # have_hdr = False 

2256 # have_hdr never used? 

2257 have_text = False 

2258 samecell_cnt = 0 

2259 col0_hdrspan = None # col0 or later header (despite its name) 

2260 col0_followed_by_nonempty = False 

2261 row_empty = True 

2262 for col_idx, cell in enumerate(row): 

2263 colspan = cell.colspan # >= 1 

2264 rowspan = cell.rowspan # >= 1 

2265 previously_seen = id(cell) in seen_cells 

2266 # checks to see if this cell was in the previous ROW 

2267 seen_cells.add(id(cell)) 

2268 if samecell_cnt == 0: 

2269 # First column of a (possible multi-column) cell 

2270 samecell_cnt = colspan - 1 

2271 else: 

2272 assert samecell_cnt > 0 

2273 samecell_cnt -= 1 

2274 continue 

2275 

2276 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0 

2277 # never used? 

2278 

2279 # defaultdict(int) around line 1900 

2280 cell_rowcnt[id(cell)] += 1 

2281 # => how many cols this spans 

2282 col = cell.text 

2283 if not col: 

2284 continue 

2285 row_empty = False 

2286 is_title = cell.is_title 

2287 

2288 # If the cell has a target, i.e., text after colon, interpret 

2289 # it as simply specifying a value for that value and ignore 

2290 # it otherwise. 

2291 if cell.target: 

2292 text, refs, defs, hdr_tags = extract_cell_content( 

2293 lang, word, col 

2294 ) 

2295 if not text: 2295 ↛ 2296line 2295 didn't jump to line 2296 because the condition on line 2295 was never true

2296 continue 

2297 refs_tags = set() 

2298 for ref in refs: # gets tags from footnotes 2298 ↛ 2299line 2298 didn't jump to line 2299 because the loop on line 2298 never started

2299 if ref in def_ht: 

2300 refs_tags.update(def_ht[ref]) 

2301 rowtags = expand_header( 

2302 wxr, 

2303 tablecontext, 

2304 word, 

2305 lang, 

2306 pos, 

2307 text, 

2308 [], 

2309 silent=True, 

2310 depth=depth, 

2311 ) 

2312 rowtags = list( 

2313 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags) 

2314 ) 

2315 is_title = False 

2316 col = cell.target 

2317 

2318 # print(rownum, col_idx, col) 

2319 # print(f"is_title: {is_title}") 

2320 if is_title: 

2321 # It is a header cell 

2322 text, refs, defs, hdr_tags = extract_cell_content( 

2323 lang, word, col 

2324 ) 

2325 if not text: 

2326 continue 

2327 # Extract tags from referenced footnotes 

2328 refs_tags = set() 

2329 for ref in refs: 

2330 if ref in def_ht: 

2331 refs_tags.update(def_ht[ref]) 

2332 

2333 # Expand header to tags 

2334 v = expand_header( 

2335 wxr, 

2336 tablecontext, 

2337 word, 

2338 lang, 

2339 pos, 

2340 text, 

2341 [], 

2342 silent=True, 

2343 depth=depth, 

2344 ) 

2345 # print("EXPANDED {!r} to {}".format(text, v)) 

2346 

2347 if col_idx == 0: 

2348 # first_col_has_text is used for a test to ignore 

2349 # upper-left cells that are just text without 

2350 # header info 

2351 first_col_has_text = True 

2352 # Check if the header expands to reset hdrspans 

2353 if any("dummy-reset-headers" in tt for tt in v): 

2354 new_hdrspans = [] 

2355 for hdrspan in hdrspans: 

2356 # if there are HdrSpan objects (abstract headers with 

2357 # row- and column-spans) that are to the left or at the 

2358 # same row or below, KEEP those; things above and to 

2359 # the right of the hdrspan with dummy-reset-headers 

2360 # are discarded. Tags from the header together with 

2361 # dummy-reset-headers are kept as normal. 

2362 if ( 

2363 hdrspan.start + hdrspan.colspan < col_idx 

2364 or hdrspan.rownum > rownum - cell.rowspan 

2365 ): 

2366 new_hdrspans.append(hdrspan) 

2367 hdrspans = new_hdrspans 

2368 

2369 for tt in v: 

2370 if "dummy-section-header" in tt: 2370 ↛ 2371line 2370 didn't jump to line 2371 because the condition on line 2370 was never true

2371 tablecontext.section_header = tt 

2372 break 

2373 if "dummy-reset-section-header" in tt: 2373 ↛ 2374line 2373 didn't jump to line 2374 because the condition on line 2373 was never true

2374 tablecontext.section_header = [] 

2375 # Text between headers on a row causes earlier headers to 

2376 # be reset 

2377 if have_text: 

2378 # print(" HAVE_TEXT BEFORE HDR:", col) 

2379 # Reset rowtags if new title column after previous 

2380 # text cells 

2381 # +-----+-----+-----+-----+ 

2382 # |hdr-a|txt-a|hdr-B|txt-B| 

2383 # +-----+-----+-----+-----+ 

2384 # ^reset rowtags=> 

2385 # XXX beware of header "—": "" - must not clear on that if 

2386 # it expands to no tags 

2387 rowtags = [()] 

2388 # have_hdr = True 

2389 # have_hdr never used? 

2390 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags)) 

2391 # Update rowtags and coltags 

2392 has_covering_hdr.add(col_idx) # col_idx == current column 

2393 # has_covering_hdr is a set that has the col_idx-ids of columns 

2394 # that have previously had some kind of header. It is never 

2395 # resetted inside the col_idx-loops OR the bigger rows-loop, so 

2396 # applies to the whole table. 

2397 

2398 rowtags, new_coltags, all_hdr_tags = generate_tags( 

2399 rowtags, table_tags 

2400 ) 

2401 

2402 if any("dummy-skip-this" in ts for ts in rowtags): 

2403 continue # Skip this cell 

2404 

2405 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2405 ↛ 2406line 2405 didn't jump to line 2406 because the condition on line 2405 was never true

2406 hdrspans.extend(tablecontext.stored_hdrspans) 

2407 

2408 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2408 ↛ 2409line 2408 didn't jump to line 2409 because the condition on line 2408 was never true

2409 tablecontext.stored_hdrspans = [] 

2410 

2411 if any("dummy-store-hdrspan" in ts for ts in v): 2411 ↛ 2413line 2411 didn't jump to line 2413 because the condition on line 2411 was never true

2412 # print(f"STORED: {col}") 

2413 store_new_hdrspan = True 

2414 else: 

2415 store_new_hdrspan = False 

2416 

2417 new_coltags = list( 

2418 x 

2419 for x in new_coltags 

2420 if not any(t in noinherit_tags for t in x) 

2421 ) 

2422 # print("new_coltags={} previously_seen={} all_hdr_tags={}" 

2423 # .format(new_coltags, previously_seen, all_hdr_tags)) 

2424 if any(new_coltags): 

2425 ( 

2426 col, 

2427 col0_followed_by_nonempty, 

2428 col0_hdrspan, 

2429 ) = add_new_hdrspan( 

2430 col, 

2431 hdrspans, 

2432 store_new_hdrspan, 

2433 col0_followed_by_nonempty, 

2434 col0_hdrspan, 

2435 ) 

2436 

2437 continue 

2438 

2439 # These values are ignored, at least for now 

2440 if re.match(r"^(# |\(see )", col): 2440 ↛ 2441line 2440 didn't jump to line 2441 because the condition on line 2440 was never true

2441 continue 

2442 

2443 if any("dummy-skip-this" in ts for ts in rowtags): 

2444 continue # Skip this cell 

2445 

2446 # If the word has no rowtags and is a multi-row cell, then 

2447 # ignore this. This happens with empty separator rows 

2448 # within a rowspan>1 cell. cf. wander/English/Conjugation. 

2449 if rowtags == [()] and rowspan > 1: 

2450 continue 

2451 

2452 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle. 

2453 if cleanup_rules: 

2454 for regx, substitution in cleanup_rules.items(): 

2455 col = re.sub(regx, substitution, col) 

2456 

2457 if ( 2457 ↛ 2462line 2457 didn't jump to line 2462 because the condition on line 2457 was never true

2458 col_idx == 0 

2459 and not first_col_has_text 

2460 and get_lang_conf(lang, "ignore_top_left_text_cell") is True 

2461 ): 

2462 continue # Skip text at top left, as in Icelandic, Faroese 

2463 

2464 # if col0_hdrspan is not None: 

2465 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}" 

2466 # .format(col0_hdrspan.text, col)) 

2467 col0_followed_by_nonempty = True 

2468 have_text = True 

2469 

2470 # Determine column tags for the multi-column cell 

2471 combined_coltags = compute_coltags( 

2472 lang, pos, hdrspans, col_idx, colspan, col 

2473 ) 

2474 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2474 ↛ 2475line 2474 didn't jump to line 2475 because the condition on line 2474 was never true

2475 continue 

2476 

2477 # Split the text into separate forms. First simplify spaces except 

2478 # newline. 

2479 col = re.sub(r"[ \t\r]+", " ", col) 

2480 # Split the cell text into alternatives 

2481 

2482 col, alts, split_extra_tags = split_text_into_alts(col) 

2483 

2484 # Some cells have mixed form content, like text and romanization, 

2485 # or text and IPA. Handle these. 

2486 alts = handle_mixed_lines(alts) 

2487 

2488 alts = list((x, combined_coltags) for x in alts) 

2489 

2490 # Generate forms from the alternatives 

2491 # alts is a list of (tuple of forms, tuple of tags) 

2492 for (form, base_roman, ipa), coltags in alts: 

2493 form = form.strip() 

2494 extra_tags = [] 

2495 extra_tags.extend(split_extra_tags) 

2496 # Handle special splits again here, so that we can have custom 

2497 # mappings from form to form and tags. 

2498 if form in form_replacements: 

2499 replacement, tags = form_replacements[form] 

2500 for x in tags.split(): 

2501 assert x in valid_tags 

2502 assert isinstance(replacement, str) 

2503 assert isinstance(tags, str) 

2504 form = replacement 

2505 extra_tags.extend(tags.split()) 

2506 

2507 check_romanization_form_transformation = False 

2508 # loop over regexes in form_transformation and replace text 

2509 # in form using regex patterns 

2510 # this does a bit of the same stuff the above does, 

2511 # but with regexes and re.sub() instead 

2512 for ( 

2513 form_transformations_pos, 

2514 v, 

2515 subst, 

2516 tags, 

2517 ) in form_transformations: 

2518 # v is a pattern string, like "^ich" 

2519 if pos != form_transformations_pos: 

2520 continue 

2521 m = re.search(v, form) 

2522 if m is not None: 

2523 form = re.sub(v, subst, form) 

2524 for x in tags.split(): 

2525 assert x in valid_tags 

2526 extra_tags.extend(tags.split()) 

2527 check_romanization_form_transformation = True 

2528 break 

2529 

2530 # Clean the value, extracting reference symbols 

2531 form, refs, defs, hdr_tags = extract_cell_content( 

2532 lang, word, form 

2533 ) 

2534 # if refs: 

2535 # print("REFS:", refs) 

2536 extra_tags.extend(hdr_tags) 

2537 # Extract tags from referenced footnotes 

2538 # Extract tags from referenced footnotes 

2539 refs_tags = set() 

2540 for ref in refs: 

2541 if ref in def_ht: 

2542 refs_tags.update(def_ht[ref]) 

2543 

2544 if base_roman: 

2545 if check_romanization_form_transformation: 2545 ↛ 2549line 2545 didn't jump to line 2549 because the condition on line 2545 was never true

2546 # because form_transformations are used to handle things 

2547 # where the romanization has the "same" structure, we 

2548 # need to handle that here too.... 

2549 for ( 

2550 _, 

2551 v, 

2552 subst, 

2553 _, 

2554 ) in form_transformations: 

2555 # v is a pattern string, like "^ich" 

2556 m = re.search(v, base_roman) 

2557 if m is not None: 

2558 base_roman = re.sub(v, subst, base_roman) 

2559 # XXX add tag stuff here if needed 

2560 break 

2561 

2562 base_roman, _, _, hdr_tags = extract_cell_content( 

2563 lang, word, base_roman 

2564 ) 

2565 extra_tags.extend(hdr_tags) 

2566 

2567 # Do some additional cleanup on the cell. 

2568 form = re.sub(r"^\s*,\s*", "", form) 

2569 form = re.sub(r"\s*,\s*$", "", form) 

2570 form = re.sub(r"\s*(,\s*)+", ", ", form) 

2571 form = re.sub(r"(?i)^Main:", "", form) 

2572 form = re.sub(r"\s+", " ", form) 

2573 form = form.strip() 

2574 

2575 # Look for parentheses that have semantic meaning 

2576 form, et = find_semantic_parens(form) 

2577 extra_tags.extend(et) 

2578 

2579 # Handle parentheses in the table element. We parse 

2580 # tags anywhere and romanizations anywhere but beginning. 

2581 roman = base_roman 

2582 paren = None 

2583 clitic = None 

2584 m = re.search(r"(\s+|^)\(([^)]*)\)", form) 

2585 # start|spaces + (anything) 

2586 if m is not None: 

2587 subst = m.group(1) 

2588 paren = m.group(2) 

2589 else: 

2590 m = re.search(r"\(([^)]*)\)(\s+|$)", form) 

2591 # (anything) + spaces|end 

2592 if m is not None: 2592 ↛ 2593line 2592 didn't jump to line 2593 because the condition on line 2592 was never true

2593 paren = m.group(1) 

2594 subst = m.group(2) 

2595 if paren is not None: 

2596 form, roman, clitic = handle_parens( 

2597 form, roman, clitic, extra_tags 

2598 ) 

2599 

2600 # Ignore certain forms that are not really forms, 

2601 # unless they're really, really close to the article title 

2602 if form in ( 2602 ↛ 2607line 2602 didn't jump to line 2607 because the condition on line 2602 was never true

2603 "", 

2604 "unchanged", 

2605 "after an", # in sona/Irish/Adj/Mutation 

2606 ): 

2607 Lev = distw([form], word) 

2608 if form and Lev < 0.1: 

2609 wxr.wtp.debug( 

2610 "accepted possible false positive '{}' with" 

2611 "> 0.1 Levenshtein distance in {}/{}".format( 

2612 form, word, lang 

2613 ), 

2614 sortid="inflection/2213", 

2615 ) 

2616 elif form and Lev < 0.3: 

2617 wxr.wtp.debug( 

2618 "skipped possible match '{}' with > 0.3" 

2619 "Levenshtein distance in {}/{}".format( 

2620 form, word, lang 

2621 ), 

2622 sortid="inflection/2218", 

2623 ) 

2624 continue 

2625 else: 

2626 continue 

2627 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} " 

2628 # "FORM={!r} ROMAN={!r}" 

2629 # .format(rowtags, coltags, refs_tags, 

2630 # form, roman)) 

2631 

2632 # Merge tags from row and column and do miscellaneous 

2633 # tag-related handling. 

2634 ( 

2635 merge_ret, 

2636 form, 

2637 some_has_covered_text, 

2638 ) = merge_row_and_column_tags(form, some_has_covered_text) 

2639 ret.extend(merge_ret) 

2640 

2641 # End of row. 

2642 rownum += 1 

2643 # For certain languages, if the row was empty, reset 

2644 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb). 

2645 if row_empty and get_lang_conf(lang, "empty_row_resets"): 

2646 hdrspans = [] 

2647 # Check if we should expand col0_hdrspan. 

2648 if col0_hdrspan is not None: 

2649 col0_allowed = get_lang_conf(lang, "hdr_expand_first") 

2650 col0_cats = tagset_cats(col0_hdrspan.tagsets) 

2651 # Only expand if col0_cats and later_cats are allowed 

2652 # and don't overlap and col0 has tags, and there have 

2653 # been no disallowed cells in between. 

2654 if ( 

2655 not col0_followed_by_nonempty 

2656 and not (col0_cats - col0_allowed) 

2657 and 

2658 # len(col0_cats) == 1 and 

2659 col_idx > col0_hdrspan.start + col0_hdrspan.colspan 

2660 ): 

2661 # If an earlier header is only followed by headers that yield 

2662 # no tags, expand it to entire row 

2663 # print("EXPANDING COL0: {} from {} to {} cols {}" 

2664 # .format(col0_hdrspan.text, col0_hdrspan.colspan, 

2665 # len(row) - col0_hdrspan.start, 

2666 # col0_hdrspan.tagsets)) 

2667 col0_hdrspan.colspan = len(row) - col0_hdrspan.start 

2668 col0_hdrspan.expanded = True 

2669 # XXX handle refs and defs 

2670 # for x in hdrspans: 

2671 # print(" HDRSPAN {} {} {} {!r}" 

2672 # .format(x.start, x.colspan, x.tagsets, x.text)) 

2673 

2674 # Post-process German nouns with articles in separate columns. We move the 

2675 # definite/indefinite/usually-without-article markers into the noun and 

2676 # remove the article entries. 

2677 if get_lang_conf(lang, "articles_in_separate_columns") and any( 

2678 "noun" in x["tags"] for x in ret 

2679 ): 

2680 new_ret = [] 

2681 saved_tags = set() 

2682 had_noun = False 

2683 for dt in ret: 

2684 tags = dt["tags"] 

2685 # print(tags) 

2686 if "noun" in tags: 

2687 tags = list( 

2688 sorted(set(t for t in tags if t != "noun") | saved_tags) 

2689 ) 

2690 had_noun = True 

2691 elif ( 2691 ↛ 2718line 2691 didn't jump to line 2718 because the condition on line 2691 was always true

2692 "indefinite" in tags 

2693 or "definite" in tags 

2694 or "usually-without-article" in tags 

2695 or "without-article" in tags 

2696 ): 

2697 if had_noun: 

2698 saved_tags = set(tags) 

2699 else: 

2700 saved_tags = saved_tags | set(tags) # E.g. Haus/German 

2701 remove_useless_tags(lang, pos, saved_tags) 

2702 saved_tags = saved_tags & set( 

2703 [ 

2704 "masculine", 

2705 "feminine", 

2706 "neuter", 

2707 "singular", 

2708 "plural", 

2709 "indefinite", 

2710 "definite", 

2711 "usually-without-article", 

2712 "without-article", 

2713 ] 

2714 ) 

2715 had_noun = False 

2716 continue # Skip the articles 

2717 

2718 dt = dt.copy() 

2719 dt["tags"] = tags 

2720 new_ret.append(dt) 

2721 ret = new_ret 

2722 

2723 elif possibly_ignored_forms: 

2724 # Some languages have tables with cells that are kind of separated 

2725 # and difficult to handle, like eulersche Formel/German where 

2726 # the definite and indefinite articles are just floating. 

2727 # If a language has a dict of conditionally_ignored_cells, 

2728 # and if the contents of a cell is found in one of the rules 

2729 # there, ignore that cell if it 

2730 # 1. Does not have the appropriate tag (like "definite" for "die") 

2731 # and 

2732 # 2. The title of the article is not one of the other co-words 

2733 # (ie. it's an article for the definite articles in german etc.) 

2734 # pass 

2735 new_ret = [] 

2736 for cell_data in ret: 

2737 tags = cell_data["tags"] 

2738 text = cell_data["form"] 

2739 skip_this = False 

2740 for key_tag, ignored_forms in possibly_ignored_forms.items(): 

2741 if text not in ignored_forms: 2741 ↛ 2743line 2741 didn't jump to line 2743 because the condition on line 2741 was always true

2742 continue 

2743 if word in ignored_forms: 

2744 continue 

2745 if key_tag not in tags: 

2746 skip_this = True 

2747 

2748 if skip_this: 2748 ↛ 2749line 2748 didn't jump to line 2749 because the condition on line 2748 was never true

2749 continue 

2750 new_ret.append(cell_data) 

2751 

2752 ret = new_ret 

2753 

2754 # Post-process English inflection tables, addding "multiword-construction" 

2755 # when the number of words has increased. 

2756 if lang == "English" and pos == "verb": 

2757 word_words = len(word.split()) 

2758 new_ret = [] 

2759 for dt in ret: 

2760 form = dt.get("form", "") 

2761 if len(form.split()) > word_words: 

2762 dt = dt.copy() 

2763 dt["tags"] = list(dt.get("tags", [])) 

2764 # This strange copy-assigning shuffle is preventative black 

2765 # magic; do not touch lest you invoke deep bugs. 

2766 data_append(dt, "tags", "multiword-construction") 

2767 new_ret.append(dt) 

2768 ret = new_ret 

2769 

2770 # Always insert "table-tags" detail as the first entry in any inflection 

2771 # table. This way we can reliably detect where a new table starts. 

2772 # Table-tags applies until the next table-tags entry. 

2773 if ret or table_tags: 

2774 table_tags = list(sorted(set(table_tags))) 

2775 dt = { 

2776 "form": " ".join(table_tags), 

2777 "source": source, 

2778 "tags": ["table-tags"], 

2779 } 

2780 if dt["form"] == "": 

2781 dt["form"] = "no-table-tags" 

2782 if tablecontext.template_name: 

2783 tn = { 

2784 "form": tablecontext.template_name, 

2785 "source": source, 

2786 "tags": ["inflection-template"], 

2787 } 

2788 ret = [dt] + [tn] + ret 

2789 else: 

2790 ret = [dt] + ret 

2791 

2792 return ret 

2793 

2794 

def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Parse ``rows`` (lists of InflCell objects) as an inflection table and
    append the extracted form entries to ``data["forms"]``.

    Exact duplicate entries are dropped, as are "dated"-tagged variants whose
    undated twin has already been collected (e.g. Russian pre-reform
    declension tables partially duplicating the modern declension).
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for table_row in rows:
        assert isinstance(table_row, list)
        for table_cell in table_row:
            assert isinstance(table_cell, InflCell)
    assert isinstance(titles, list)
    for title in titles:
        assert isinstance(title, str)

    # Try to parse the table as a simple table.
    parsed = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if parsed is None:
        # XXX handle other table formats
        # We were not able to handle the table.
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms, eliminating duplicates as we go.
    have_forms = set()
    for dt in parsed:
        fdt = freeze(dt)
        if fdt in have_forms:
            continue  # Don't add duplicates

        # Some Russian words have Declension and Pre-reform declension
        # partially duplicating the same data.  Don't add the "dated"-tagged
        # variant if the identical entry without "dated" already came from
        # the modern declension table.
        tags = dt.get("tags", [])
        redundant_dated = False
        for dated_tag in ("dated",):
            if dated_tag not in tags:
                continue
            undated = dt.copy()
            undated_tags = [t for t in tags if t != dated_tag]
            undated["tags"] = undated_tags
            if undated_tags and freeze(undated) in have_forms:
                redundant_dated = True  # Already have it without "dated"
                break
        if redundant_dated:
            continue

        # "table-tags" marker entries are never deduplicated against.
        if "table-tags" not in tags:
            have_forms.add(fdt)
        data_append(data, "forms", dt)

2850 

2851 

def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Decide whether a table cell should be treated as a header cell.

    Applies a chain of heuristics (explicit <th> markup, tag-expansion of
    the cell text, per-language whitelists, cell style comparison, and
    whole-column header markers) in a deliberate priority order.

    Returns a tuple ``(is_title, hdr_expansion, target, celltext)``:
    ``is_title`` — True when the cell is judged to be a header;
    ``hdr_expansion`` — tagset expansion of the cleaned cell text;
    ``target`` — suffix extracted from a "Header: target" style cell,
    or the incoming ``target`` unchanged;
    ``celltext`` — the cell text, truncated when a target was split off.
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # Determine what a "header cell" node looks like for this table type.
    # NOTE(review): if table_kind is neither TABLE nor HTML, header_kind
    # would be unbound below — presumably callers guarantee one of the two.
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    # "Header: target" cells are split at the first ": ".
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    # Expand the cleaned text into tagsets without emitting debug messages
    # and ignoring actual tag content; we only care whether expansion yields
    # "error-" or "dummy-" tags.
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # A data cell whose text *could* be a header: accept it only for
        # languages/texts explicitly whitelisted in
        # LANGUAGES_WITH_CELLS_AS_HEADERS; otherwise just log the rejection.
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    # Priority chain: explicit "Header: target" split > real <th> markup >
    # whitelisted candidate text > matching cell style.
    if idx >= 0 and titletext[:idx] in infl_map:
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        # Style matches the first column's style; same whitelist gating as
        # above, but here acceptance directly sets is_title.
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext

3044 

3045 

class TableContext:
    """Saved context used when parsing a table and its subtables.

    Attributes:
        stored_hdrspans: header-span data carried over between (sub)tables.
        section_header: header data from the surrounding section.
        template_name: name of the inflection template that produced the
            table, or "" when unknown.
    """

    # BUG FIX: this was previously misspelled ``__slot__`` (singular),
    # which has no special meaning to Python, so the declaration was inert
    # and instances silently got a normal ``__dict__``.  Spelled correctly,
    # attribute storage is restricted to the declared names.
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name: Optional[str] = None) -> None:
        self.stored_hdrspans = []
        self.section_header = []
        # Normalize a missing/falsy template name to the empty string.
        if not template_name:
            self.template_name = ""
        else:
            self.template_name = template_name

3062 

3063 

def handle_wikitext_or_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms. This adds the forms into ``data``.

    ``tree`` must be either a wikitext TABLE node or an HTML <table> node;
    nested tables found inside cells are flattened out and processed in
    document order rather than recursively.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("-------==========-------")

    if not tablecontext:
        tablecontext = TableContext()

    def handle_table1(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        depth,
    ):
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order."""

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        col_gap_data = []  # Filling for columns with rowspan > 1
        # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
        # the column; vertical_still_left contains int
        cols_headered = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows = []

        sub_ret = []  # Flattened-out subtables: (rows, titles, after, depth)

        # from wikitextprocessor.parser import print_tree
        # print_tree(tree)
        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            # For HTML nodes the "kind" is the tag name string; for wikitext
            # nodes it is the NodeKind enum value.
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are intially shown in tables
                    # that have more data. The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # if (
                #     len(node.children) == 1
                #     and node.children[0].attrs.get("class") == "separator"
                # ):
                #     print("------------------ skip separator")
                #     continue

                # Parse a table row.
                row = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in get_table_cells(node):
                    # loop through each cell in the ROW

                    # The below skip is not needed anymore, because we "skip" in
                    # get_table_cells, but left here as a comment
                    # if not isinstance(col, WikiNode):
                    #     # This skip is not used for counting,
                    #     # "None" is not used in
                    #     # indexing or counting or looping.
                    #     continue
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print("    UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column. It is populated at the end of the loop, at the
                        # same time as col_gap_data. This needs to be looped and
                        # filled this way because each `for col`-looping jumps
                        # straight to the next meaningful cell; there is no
                        # "None" cells, only emptiness between, and rowspan and
                        # colspan are just to generate the "fill-
                        vertical_still_left[len(row)] -= 1
                        row.append(col_gap_data[len(row)])

                        # appending row is how "indexing" is
                        # done here; something is appended,
                        # like a filler-cell here or a "start"
                        # cell at the end of the row-loop,
                        # which increased len(row) which is
                        # then used as the target-index to check
                        # for gaps. vertical_still_left is
                        # the countdown to when to stop
                        # filling in gaps, and goes down to 0,
                        # and col_gap_data is not touched
                        # except when a new rowspan is needed,
                        # at the same time that
                        # vertical_still_left gets reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        # Malformed span attributes: treat as a plain cell.
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Too many of these errors
                    # NOTE(review): the commented-out messages say "set to 1"
                    # but the code actually caps the span at 100.
                    if colspan > 100:
                        # wxr.wtp.error(
                        #     f"Colspan {colspan} over 30, set to 1",
                        #     sortid="inflection/20250113a",
                        # )
                        colspan = 100
                    if rowspan > 100:
                        # wxr.wtp.error(
                        #     f"Rowspan {rowspan} over 30, set to 1",
                        #     sortid="inflection/20250113b",
                        # )
                        rowspan = 100

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: isinstance(x, WikiNode)
                        and (x.kind == NodeKind.TABLE or x.sarg == "table"),
                    )

                    # Clean the rest of the cell.
                    celltext = clean_node(wxr, None, rest)
                    # print("CLEANED:", celltext)
                    # print(f"SUBTABLES: {tables}")

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a descrip-
                        # tive text that should be treated as a title (e.g.,
                        # "Forms with the definite article", with "definite" not
                        # mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Flush the rows accumulated so far, then splice
                            # in the subtable's pieces to keep document order.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    # Strip trailing superscript characters (footnote marks).
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #     row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/ed if
                                # it is lacking slots
                                # for each column; col_gap_data and
                                # vertical_still_left are never
                                # reset to [], during
                                # the whole table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that
                            # will be used in the
                            # future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to be
                            # filled (row.append or
                            # row[col_gap_data[len(row)] =>
                            # rows), it is not reset to [], but decremented to 0
                            # each time a row gets something from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        # Append the remaining rows after any flattened subtables.
        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects. Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )

3425 

3426 

def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """If a wikitext table cell contains HTML cells `<td>`, as they sometimes
    do because it is easier to write wikitext conditionals that way,
    those td-elements are parsed as child elements of the Wikitext cell.
    This generator will yield wikitext and HTML direct children of
    `node` and if a Wikitext TABLE_CELL has direct td-element children,
    those are also yielded."""

    def _is_html_cell(child) -> bool:
        # True for a <th> or <td> HTML element nested inside a cell.
        return isinstance(child, HTMLNode) and child.sarg in ("th", "td")

    for cell in node.children:
        if not isinstance(cell, WikiNode):
            continue
        embedded = [c for c in cell.children if _is_html_cell(c)]
        if embedded:
            # Strip the embedded th/td elements out of the cell so they are
            # not returned twice, then yield the cell followed by them.
            cell.children = [c for c in cell.children if not _is_html_cell(c)]
            yield cell
            yield from embedded
        else:
            yield cell

3455 

3456 

def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Thin dispatcher for HTML tables; delegates to the shared
    wikitext/HTML table handler.  XXX, remove these?"""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext=tablecontext,
    )

3464 

3465 

def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """A passer-on function for wikitext tables, XXX, remove these?"""
    handle_wikitext_or_html_table(
        wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
    )

3473 

3474 

def parse_inflection_section(
    wxr, data, word, lang, pos, section, tree, tablecontext=None
):
    """Parses an inflection section on a page.  ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    The section ``tree`` is walked recursively; every wikitext or HTML
    table found is collected together with any title text that precedes
    it (NavFrame headers, bolded ";"-list lines), and each collected
    table is then passed on to the table handlers.
    """

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    #       .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Each entry of ``tables`` is a mutable list
    # [kind, table_node, titles, after_parts], where kind is "wikitext"
    # or "html" and after_parts accumulates stray text encountered after
    # the table node.
    tables = []
    # Text fragments inside a NavFrame's NavHead, joined into a title
    # when the matching NavContent is reached.
    titleparts = []
    # Title text taken from a preceding ";"-definition list line, used
    # as an initial title for subsequent top-level children.
    preceding_bolded_title = ""

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("--------------******************----------------")

    def process_tables():
        # Dispatch every collected table to the appropriate handler.
        for kind, node, titles, after in tables:
            after = "".join(after).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node, titles):
        # Handle a NavFrame div: tables inside it get the NavHead text as
        # an extra title.  Tables are collected into a fresh list so they
        # can be processed immediately, then the outer list is restored.
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(node, titles, navframe=False):
        # Walk the node tree collecting tables and their titles.
        # ``navframe`` is True while inside a NavFrame div, where string
        # content contributes to the title rather than being table text.
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Stray strings after a table are saved as "after" text;
            # inside a NavFrame they are title fragments.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    # Show/hide widget; carries no content of interest.
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    # Join the NavHead fragments into a single title and
                    # attach it (unless it is just a notes header), then
                    # descend with navframe mode off.
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                classes = node.attrs.get("class", ())
                if "audiotable" in classes:
                    # Pronunciation audio tables are not inflection data.
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
            if (
                kind == NodeKind.HTML
                and node.sarg == "div"
                and "NavFrame" in node.attrs.get("class", "").split()
            ):
                recurse_navframe(node, titles)
                return
        if kind == NodeKind.LINK:
            # For piped links, use the display text; otherwise the target.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A ";"-definition list line acts as a bolded title for the
            # tables that follow it at the top level.
            nonlocal preceding_bolded_title
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")