Coverage for src/wiktextract/extractor/en/inflection.py: 87%

1521 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1# Code for parsing inflection tables. 

2# 

3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org. 

4 

5import collections 

6import copy 

7import functools 

8import html 

9import itertools 

10import re 

11import unicodedata 

12from typing import Generator, Optional, Union 

13 

14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, freeze, split_at_comma_semi 

18from ...tags import valid_tags 

19from ...wxr_context import WiktextractContext 

20from .form_descriptions import ( 

21 classify_desc, 

22 decode_tags, 

23 distw, 

24 parse_head_final_tags, 

25) 

26from .inflection_kludges import ka_decl_noun_template_cell 

27from .inflectiondata import infl_map, infl_start_map, infl_start_re 

28from .lang_specific_configs import get_lang_conf, lang_specific_tags 

29from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS 

30from .type_utils import FormData 

31 

32# --debug-text-cell WORD 

33# Command-line parameter for debugging. When parsing inflection tables, 

34# print out debug messages when encountering this text. 

debug_cell_text: Optional[str] = None


def set_debug_cell_text(text: str) -> None:
    """Set the cell text that triggers debug printouts during table parsing.

    Stores ``text`` in the module-level ``debug_cell_text`` variable, which
    the parsing code consults when deciding whether to emit debug messages.
    """
    global debug_cell_text
    debug_cell_text = text

41 

42 

# Type alias: a cell's interpretation as a list of alternative tag tuples.
TagSets = list[tuple[str, ...]]

# Column texts that are interpreted as an empty column.
IGNORED_COLVALUES = {
    # Various Unicode hyphen/dash characters used as "no form" placeholders.
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "-",
    "/",
    "?",
    "not used",
    "not applicable",
}

# These tags are never inherited from above
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

78 

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags.
# Every value is simply the subject tag with an "object-" prefix.
object_concord_replacements = {
    subject_tag: "object-" + subject_tag
    for subject_tag in (
        "first-person",
        "second-person",
        "third-person",
        "singular",
        "plural",
        "definite",
        "indefinite",
        # noun-class concord tags 1-18
        *("class-" + str(n) for n in range(1, 19)),
        "masculine",
        "feminine",
    )
}

110 

# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Matches boilerplate like "Declension of <word>" in table titles; such
# matches are filtered out when scanning titles for the maps above/below.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)

151 

# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
    "contracted": "contracted",
    "present": "present",
    "perfect": "perfect",
    "imperfect": "imperfect",
    "pluperfect": "pluperfect",
    "future": "future",
    "aorist": "aorist",
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)

233 

# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "Attic": "Attic",  # e.g. καλός/Greek/Adj
    "Epic": "Epic",  # e.g. καλός/Greek/Adj
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

253 

# Parenthized element starts to map them to tags for form for the rest of
# the element
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Matches a parenthesized element that begins with one of the keys above
# (followed by a space); the remainder of the element becomes a "form".
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)

274 

275 

# Regexp for cell starts that are likely definitions of reference symbols.
# See also nondef_re.
# Groups 3, 5, and 6 capture the reference symbol itself; they are used by
# extract_cell_content() when splitting a cell into (symbol, text) pairs.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"

# Certain tags are moved from headers in tables into word tags, as they always
# apply to the whole word.
TAGS_FORCED_WORDTAGS: set[str] = set(
    [
        # This was originally created for a issue with number paradigms in
        # Arabic, but that is being handled elsewhere now.
    ]
)

302 

303 

class InflCell:
    """A single cell of an inflection table.

    Holds the stripped cell text, whether the cell acts as a header/title,
    its column/row spans, and an optional link target extracted from the
    cell contents.
    """

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        self.text = text.strip()
        # A cell with empty text never counts as a title.  Note that this
        # keeps the short-circuit value of ``text and is_title`` (i.e. ""
        # rather than False when ``text`` is empty), as the original did.
        self.is_title = text and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        desc = f"{self.text}/{self.is_title}/{self.colspan}/{self.rowspan!r}"
        if self.target:
            desc += f": {self.target!r}"
        return desc

    def __repr__(self) -> str:
        return str(self)

344 

345 

class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: TagSets,
        text: str,
        all_headers_row: bool,
    ) -> None:
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        for alternative in tagsets:
            assert isinstance(alternative, tuple)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Normalize every alternative: deduplicate its tags and sort them
        # so that tagsets compare reliably later on.
        self.tagsets = [
            tuple(sorted(set(alternative))) for alternative in tagsets
        ]
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False

386 

387 

def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        unicode_name = unicodedata.name(ch)
    except ValueError:
        # Unassigned/unnamed code points are never superscripts.
        return False
    # Superscript characters carry one of these Unicode name prefixes.
    return unicode_name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )

404 

405 

def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no
    purpose together (cover all options).

    Mutates ``tags`` in place and returns None.  ``lang`` selects the
    language-specific configuration consulted via ``get_lang_conf``;
    ``pos`` is currently unused but kept for interface consistency with
    the other tagset helpers.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Paired opposites that cancel out when the language config says so.
    if (
        "animate" in tags
        and "inanimate" in tags
        and get_lang_conf(lang, "animate_inanimate_remove")
    ):
        tags.remove("animate")
        tags.remove("inanimate")
    if (
        "virile" in tags
        and "nonvirile" in tags
        and get_lang_conf(lang, "virile_nonvirile_remove")
    ):
        tags.remove("virile")
        tags.remove("nonvirile")
    # If every tag of a whole category (all numbers, all genders, ...) for
    # the language is present, the category conveys no information and all
    # of its tags are dropped.  This replaces six copy-pasted stanzas in
    # the original with one data-driven loop; the categories are checked in
    # the same order as before.
    for conf_key in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        category_tags = get_lang_conf(lang, conf_key)
        if category_tags and all(x in tags for x in category_tags):
            for x in category_tags:
                tags.remove(x)

456 

457 

def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    # Flatten all alternatives and map each tag to its category.
    return {valid_tags[tag] for alternative in tagset for tag in alternative}

462 

463 

def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations).  If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives.  The tagsets are assumed be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # BUG FIX: the original re-validated ``tagsets1`` here (copy-paste
    # error); ``tagsets2`` was never checked.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Add one alternative to ``tagsets``, merging it with an existing
        # alternative when they differ in at most one tag category.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2: count the
            # categories in which they differ.
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        # Both inputs were empty (or contained only empty alternatives);
        # return a single empty alternative.
        tagsets.append(())
    return tagsets

526 

527 

def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # BUG FIX: the original re-validated ``tagsets1`` here (copy-paste
    # error); ``tagsets2`` was never checked.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            # The placeholder produced for ignored text cells must not leak
            # into the merged result.
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    return new_tagsets

554 

555 

@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing.  This returns
    (cleaned, refs, defs, tags), where ``cleaned`` is the cell text with
    reference markers stripped, ``refs`` is the list of reference symbols
    found on the cell, ``defs`` is a list of (symbol, definition-text)
    pairs when the cell defines reference symbols, and ``tags`` is extra
    header tags implied by special reference markers."""
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags: list[str] = []
    # Strip a trailing comma or bullet and collapse all whitespace runs.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Cells that start with explanatory prose are ignored wholesale.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers
    refs: list[str] = []
    special_references = get_lang_conf(lang, "special_references")
    # Repeatedly peel off trailing "^x" / "^(x,y)" reference markers.
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # The cell defines reference symbols; split it into
        # (symbol, definition-text) pairs using successive def_re matches.
        ofs = 0
        ref = None
        deflst: list[tuple[str, str]] = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            # def_re captures the symbol in group 3, 5, or 6.
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                # Superscript "rare" marker becomes a tag, not a reference.
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        # E.g. "1) some note text" -> definition for symbol "1".
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    #       .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags

688 

689 

@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title.  This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is a list of dictionaries describing additional
    forms to be included in the part-of-speech entry."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Strip HTML entities/markup and collapse whitespace before matching.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags: list[str] = []
    table_tags: list[str] = []
    extra_forms: list[FormData] = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip boilerplate such as "Inflection of <word>".
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                # Elements that merely *start* with a known prefix turn the
                # remainder into a "class"-like extra form.
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms

775 

776 

777def expand_header( 

778 wxr: WiktextractContext, 

779 tablecontext: "TableContext", 

780 word: str, 

781 lang: str, 

782 pos: str, 

783 text: str, 

784 base_tags: Union[list[str], set[str], tuple[str, ...]], 

785 silent=False, 

786 ignore_tags=False, 

787 depth=0, 

788 column_number: int | None = None, 

789) -> list[tuple[str, ...]]: 

790 """Expands a cell header to tagset, handling conditional expressions 

791 in infl_map. This returns list of tuples of tags, each list element 

792 describing an alternative interpretation. ``base_tags`` is combined 

793 column and row tags for the cell in which the text is being interpreted 

794 (conditional expressions in inflection data may depend on it). 

795 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags`` 

796 is True, then tags listed in "if" will be ignored in the test (this is 

797 used when trying to heuristically detect whether a non-<th> cell is anyway 

798 a header).""" 

799 assert isinstance(wxr, WiktextractContext) 

800 assert isinstance(word, str) 

801 assert isinstance(lang, str) 

802 assert isinstance(pos, str) 

803 assert isinstance(text, str) 

804 assert isinstance(base_tags, (list, tuple, set)) 

805 assert silent in (True, False) 

806 assert isinstance(depth, int) 

807 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags)) 

808 # First map the text using the inflection map 

809 text = clean_value(wxr, text) 

810 combined_return: list[tuple[str, ...]] = [] 

811 parts = split_at_comma_semi(text, separators=[";"]) 

812 for text in parts: 

813 if not text: 813 ↛ 814line 813 didn't jump to line 814 because the condition on line 813 was never true

814 continue 

815 if text in infl_map: 

816 v = infl_map[text] # list or string 

817 else: 

818 m = re.match(infl_start_re, text) 

819 if m is not None: 819 ↛ 820line 819 didn't jump to line 820 because the condition on line 819 was never true

820 v = infl_start_map[m.group(1)] 

821 # print("INFL_START {} -> {}".format(text, v)) 

822 elif re.match(r"Notes", text): 

823 # Ignored header 

824 # print("IGNORING NOTES") 

825 combined_return = or_tagsets( 

826 lang, pos, combined_return, [("dummy-skip-this",)] 

827 ) 

828 # this just adds dummy-skip-this 

829 continue 

830 elif text in IGNORED_COLVALUES: 

831 combined_return = or_tagsets( 

832 lang, pos, combined_return, [("dummy-ignore-skipped",)] 

833 ) 

834 continue 

835 # Try without final parenthesized part 

836 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text) 

837 if text_without_parens in infl_map: 

838 v = infl_map[text_without_parens] 

839 elif m is None: 839 ↛ 855line 839 didn't jump to line 855 because the condition on line 839 was always true

840 if not silent: 

841 wxr.wtp.debug( 

842 "inflection table: unrecognized header: {}".format( 

843 repr(text) 

844 ), 

845 sortid="inflection/735", 

846 ) 

847 # Unrecognized header 

848 combined_return = or_tagsets( 

849 lang, pos, combined_return, [("error-unrecognized-form",)] 

850 ) 

851 continue 

852 

853 # Then loop interpreting the value, until the value is a simple string. 

854 # This may evaluate nested conditional expressions. 

855 default_else = None 

856 while True: 

857 # If it is a string, we are done. 

858 if isinstance(v, str): 

859 tags = set(v.split()) 

860 remove_useless_tags(lang, pos, tags) 

861 tagset = [tuple(sorted(tags))] 

862 break 

863 # For a list, just interpret it as alternatives. (Currently the 

864 # alternatives must directly be strings.) 

865 if isinstance(v, (list, tuple)): 

866 tagset = [] 

867 for x in v: 

868 tags = set(x.split()) 

869 remove_useless_tags(lang, pos, tags) 

870 tags_t = tuple(sorted(tags)) 

871 if tags_t not in tagset: 871 ↛ 867line 871 didn't jump to line 867 because the condition on line 871 was always true

872 tagset.append(tags_t) 

873 break 

874 # Otherwise the value should be a dictionary describing a 

875 # conditional expression. 

876 if not isinstance(v, dict): 876 ↛ 877line 876 didn't jump to line 877 because the condition on line 876 was never true

877 wxr.wtp.debug( 

878 "inflection table: internal: " 

879 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]), 

880 sortid="inflection/767", 

881 ) 

882 tagset = [()] 

883 break 

884 # Evaluate the conditional expression. 

885 assert isinstance(v, dict) 

886 cond: Union[bool, str] = "default-true" 

887 c: Union[str, list[str], set[str]] = "" 

888 # Handle "lang" condition. The value must be either a 

889 # single language or a list of languages, and the 

890 # condition evaluates to True if the table is one of 

891 # those languages. 

892 if "lang" in v: 

893 c = v["lang"] 

894 if isinstance(c, str): 

895 cond = c == lang 

896 else: 

897 assert isinstance(c, (list, tuple, set)) 

898 cond = lang in c 

899 # Handle "nested-table-depth" condition. The value must 

900 # be an int or list of ints, and the condition evaluates 

901 # True if the depth is one of those values. 

902 # "depth" is how deep into a nested table tree the current 

903 # table lies. It is first started in handle_wikitext_table, 

904 # so only applies to tables-within-tables, not other 

905 # WikiNode content. `depth` is currently only passed as a 

906 # parameter down the table parsing stack, and not stored. 

907 if cond and "nested-table-depth" in v: 907 ↛ 908line 907 didn't jump to line 908 because the condition on line 907 was never true

908 d = v["nested-table-depth"] 

909 if isinstance(d, int): 

910 cond = d == depth 

911 else: 

912 assert isinstance(d, (list, tuple, set)) 

913 cond = depth in d 

914 # Column index: check if we're in position X of the row 

915 if cond and "column-index" in v: 

916 index = v["column-index"] 

917 if isinstance(index, int): 917 ↛ 920line 917 didn't jump to line 920 because the condition on line 917 was always true

918 cond = index == column_number 

919 else: 

920 assert isinstance(index, (list, tuple, set)) 

921 cond = column_number in index 

922 # Handle inflection-template condition. Must be a string 

923 # or list of strings, and if tablecontext.template_name is in 

924 # those, accept the condition. 

925 # TableContext.template_name is passed down from page/ 

926 # parse_inflection, before parsing and expanding itself 

927 # has begun. 

928 if cond and tablecontext and "inflection-template" in v: 

929 d1 = v["inflection-template"] 

930 if isinstance(d1, str): 930 ↛ 933line 930 didn't jump to line 933 because the condition on line 930 was always true

931 cond = d1 == tablecontext.template_name 

932 else: 

933 assert isinstance(d1, (list, tuple, set)) 

934 cond = tablecontext.template_name in d1 

935 # Handle "pos" condition. The value must be either a single 

936 # part-of-speech or a list of them, and the condition evaluates to 

937 # True if the part-of-speech is any of those listed. 

938 if cond and "pos" in v: 

939 c = v["pos"] 

940 if isinstance(c, str): 

941 cond = c == pos 

942 else: 

943 assert isinstance(c, (list, tuple, set)) 

944 cond = pos in c 

945 # Handle "if" condition. The value must be a string containing a 

946 # space-separated list of tags. The condition evaluates to True if 

947 # ``base_tags`` contains all of the listed tags. If the condition 

948 # is of the form "any: ...tags...", then any of the tags will be 

949 # enough. 

950 if cond and "if" in v and not ignore_tags: 

951 c = v["if"] 

952 assert isinstance(c, str) 

953 # "if" condition is true if any of the listed tags is present if 

954 # it starts with "any:", otherwise all must be present 

955 if c.startswith("any: "): 

956 cond = any(t in base_tags for t in c[5:].split()) 

957 else: 

958 cond = all(t in base_tags for t in c.split()) 

959 

960 # Handle "default" assignment. Store the value to be used 

961 # as a default later. 

962 if "default" in v: 

963 assert isinstance(v["default"], str) 

964 default_else = v["default"] 

965 

966 # Warning message about missing conditions for debugging. 

967 

968 if cond == "default-true" and not default_else and not silent: 

969 wxr.wtp.debug( 

970 "inflection table: IF MISSING COND: word={} " 

971 "lang={} text={} base_tags={} c={} cond={}".format( 

972 word, lang, text, base_tags, c, cond 

973 ), 

974 sortid="inflection/851", 

975 ) 

976 # Based on the result of evaluating the condition, select either 

977 # "then" part or "else" part. 

978 if cond: 

979 v = v.get("then", "") 

980 else: 

981 v1 = v.get("else") 

982 if v1 is None: 

983 if default_else: 

984 v = default_else 

985 else: 

986 if not silent: 

987 wxr.wtp.debug( 

988 "inflection table: IF WITHOUT ELSE EVALS " 

989 "False: " 

990 "{}/{} {!r} base_tags={}".format( 

991 word, lang, text, base_tags 

992 ), 

993 sortid="inflection/865", 

994 ) 

995 v = "error-unrecognized-form" 

996 else: 

997 v = v1 

998 

999 # Merge the resulting tagset from this header part with the other 

1000 # tagsets from the whole header 

1001 combined_return = or_tagsets(lang, pos, combined_return, tagset) 

1002 

1003 # Return the combined tagsets, or empty tagset if we got no tagsets 

1004 if not combined_return: 

1005 combined_return = [()] 

1006 return combined_return 

1007 

1008 

def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: "list[HdrSpan]",  # annotation fixed: body reads HdrSpan attrs
    start: int,
    colspan: int,
    celltext: str,  # annotation fixed: asserted to be str below
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Walks ``hdrspans`` (headers collected so far for this table) from the
    bottom of the table upward, keeping only spans that horizontally cover
    the cell at columns [start, start+colspan), and combines their tagsets:
    headers on the same row are OR-ed together, different rows are AND-ed.
    A large set of language-specific heuristics decides when to stop taking
    headers from further up (to prevent unrelated categories "bleeding"
    into the cell) or to skip a header entirely.

    ``celltext`` is used only for --debug-text-cell debug printing.
    Returns a list of tag tuples (alternative tagsets) for the cell.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    #       .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                "  row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    # (start, colspan) keys already consumed; blocks reuse of the same
    # column position from a higher row (subject to "reuse_cellspan").
    used = set()
    coltags = [()]
    last_header_row = 1000000
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000
    used_hdrspans = set()  # id()s of HdrSpans already merged in
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Same for partial overlap on the right edge, except headers that
        # were artificially expanded (see below / add_new_hdrspan) are let
        # through.
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            # in_cats: tag categories used by same-row headers inside
            # the cell's column range.
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets.  This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row.  Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside

                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above.  However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 10000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above.
            # NOTE: in the elif chain below, "break" stops taking headers
            # from higher rows entirely, while falling through (or
            # "continue") skips only this header; the distinction per
            # category pair is deliberate and partly language-configured.
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags

1389 

1390 

1391def parse_simple_table( 

1392 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth 

1393): 

1394 """This is the default table parser. Despite its name, it can parse 

1395 complex tables. This returns a list of forms to be added to the 

1396 part-of-speech, or None if the table could not be parsed.""" 

1397 assert isinstance(wxr, WiktextractContext) 

1398 assert isinstance(tablecontext, TableContext) 

1399 assert isinstance(word, str) 

1400 assert isinstance(lang, str) 

1401 assert isinstance(pos, str) 

1402 assert isinstance(rows, list) 

1403 assert isinstance(source, str) 

1404 assert isinstance(after, str) 

1405 assert isinstance(depth, int) 

1406 for row in rows: 

1407 for col in row: 

1408 assert isinstance(col, InflCell) 

1409 assert isinstance(titles, list) 

1410 for x in titles: 

1411 assert isinstance(x, str) 

1412 

1413 # print("PARSE_SIMPLE_TABLE: TITLES:", titles) 

1414 if debug_cell_text: 1414 ↛ 1415line 1414 didn't jump to line 1415 because the condition on line 1414 was never true

1415 print("ROWS:") 

1416 for row in rows: 

1417 print(" ", row) 

1418 

1419 # Check for forced rowspan kludge. See e.g. 

1420 # maorski/Serbo-Croatian. These are essentially multi-row 

1421 # cells implemented using <br> rather than separate cell. We fix this 

1422 # by identifying rows where this happens, and splitting the current row 

1423 # to multiple rows by synthesizing additional cells. 

1424 new_rows = [] 

1425 for row in rows: 

1426 split_row = ( 

1427 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row) 

1428 and 

1429 # x is an InflCell 

1430 all(x.rowspan == 1 for x in row) 

1431 ) 

1432 if not split_row: 

1433 new_rows.append(row) 

1434 continue 

1435 row1 = [] 

1436 row2 = [] 

1437 for cell in row: 

1438 cell1 = copy.deepcopy(cell) 

1439 if "\n" in cell.text: 

1440 # Has more than one line - split this cell 

1441 parts = cell.text.strip().splitlines() 

1442 if len(parts) != 2: 1442 ↛ 1443line 1442 didn't jump to line 1443 because the condition on line 1442 was never true

1443 wxr.wtp.debug( 

1444 "forced rowspan kludge got {} parts: {!r}".format( 

1445 len(parts), cell.text 

1446 ), 

1447 sortid="inflection/1234", 

1448 ) 

1449 cell2 = copy.deepcopy(cell) 

1450 cell1.text = parts[0] 

1451 cell2.text = parts[1] 

1452 else: 

1453 cell1.rowspan = 2 

1454 cell2 = cell1 # ref, not a copy 

1455 row1.append(cell1) 

1456 row2.append(cell2) 

1457 new_rows.append(row1) 

1458 new_rows.append(row2) 

1459 rows = new_rows 

1460 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:") 

1461 # for row in rows: 

1462 # print(" ", row) 

1463 

1464 # Parse definitions for references (from table itself and from text 

1465 # after it) 

1466 def_ht = {} 

1467 

1468 def add_defs(defs: list[tuple[str, str]]) -> None: 

1469 for ref, d in defs: 

1470 # print("DEF: ref={} d={}".format(ref, d)) 

1471 d = d.strip() 

1472 d = d.split(". ")[0].strip() # text before ". " 

1473 if not d: 1473 ↛ 1474line 1473 didn't jump to line 1474 because the condition on line 1473 was never true

1474 continue 

1475 if d.endswith("."): # catc ".."?? 

1476 d = d[:-1] 

1477 tags, topics = decode_tags(d, no_unknown_starts=True) 

1478 # print(f"{ref=}, {transformed=}, {tags=}") 

1479 if topics or any("error-unknown-tag" in ts for ts in tags): 

1480 d = d[0].lower() + d[1:] 

1481 tags, topics = decode_tags( 

1482 d, no_unknown_starts=True 

1483 ) 

1484 if topics or any("error-unknown-tag" in ts for ts in tags): 

1485 # Failed to parse as tags 

1486 # print("Failed: topics={} tags={}" 

1487 # .format(topics, tags)) 

1488 continue 

1489 tags1_s: set[str] = set() 

1490 for ts in tags: 

1491 # Set.update is a union operation: definition tags are flat 

1492 tags1_s.update(ts) 

1493 tags1 = tuple(sorted(tags1_s)) 

1494 # print("DEFINED: {} -> {}".format(ref, tags1)) 

1495 def_ht[ref] = tags1 

1496 

1497 def generate_tags( 

1498 rowtags: list[tuple[str]], table_tags: list[str] 

1499 ) -> tuple[ 

1500 list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]] 

1501 ]: 

1502 new_coltags = [] 

1503 all_hdr_tags = [] # list of tuples 

1504 new_rowtags = [] 

1505 for rt0 in rowtags: 

1506 for ct0 in compute_coltags( 

1507 lang, 

1508 pos, 

1509 hdrspans, 

1510 col_idx, # col_idx=>start 

1511 colspan, 

1512 col, # cell_text 

1513 ): 

1514 base_tags: set[str] = ( 

1515 set(rt0) 

1516 | set(ct0) 

1517 | set(global_tags) 

1518 | set(itertools.chain.from_iterable(table_tags)) 

1519 ) # Union. 

1520 alt_tags = expand_header( 

1521 wxr, 

1522 tablecontext, 

1523 word, 

1524 lang, 

1525 pos, 

1526 text, 

1527 base_tags, 

1528 depth=depth, 

1529 column_number=col_idx, 

1530 ) 

1531 # base_tags are used in infl_map "if"-conds. 

1532 for tt in alt_tags: 

1533 if tt not in all_hdr_tags: 

1534 all_hdr_tags.append(tt) 

1535 tt_s = set(tt) 

1536 # Certain tags are always moved to word-level tags 

1537 if tt_s & TAGS_FORCED_WORDTAGS: 1537 ↛ 1538line 1537 didn't jump to line 1538 because the condition on line 1537 was never true

1538 table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS) 

1539 tt_s = tt_s - TAGS_FORCED_WORDTAGS 

1540 # Add tags from referenced footnotes 

1541 tt_s.update(refs_tags) 

1542 # Sort, convert to tuple, and add to set of 

1543 # alternatives. 

1544 tt = tuple(sorted(tt_s)) 

1545 if tt not in new_coltags: 

1546 new_coltags.append(tt) 

1547 # Kludge (saprast/Latvian/Verb): ignore row tags 

1548 # if trying to add a non-finite after mood. 

1549 if any(valid_tags[t] == "mood" for t in rt0) and any( 

1550 valid_tags[t] == "non-finite" for t in tt 

1551 ): 

1552 tags = tuple(sorted(set(tt) | set(hdr_tags))) 

1553 else: 

1554 tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags))) 

1555 if tags not in new_rowtags: 

1556 new_rowtags.append(tags) 

1557 return new_rowtags, new_coltags, all_hdr_tags 

1558 

1559 def add_new_hdrspan( 

1560 col: str, 

1561 hdrspans: list[HdrSpan], 

1562 store_new_hdrspan: bool, 

1563 col0_followed_by_nonempty: bool, 

1564 col0_hdrspan: Optional[HdrSpan], 

1565 ) -> tuple[str, bool, Optional[HdrSpan]]: 

1566 hdrspan = HdrSpan( 

1567 col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers 

1568 ) 

1569 hdrspans.append(hdrspan) 

1570 

1571 # infl-map tag "dummy-store-hdrspan" causes this new hdrspan 

1572 # to be added to a register of stored hdrspans to be used 

1573 # later with "dummy-load-stored-hdrspans". 

1574 if store_new_hdrspan: 1574 ↛ 1575line 1574 didn't jump to line 1575 because the condition on line 1574 was never true

1575 tablecontext.stored_hdrspans.append(hdrspan) 

1576 

1577 # Handle headers that are above left-side header 

1578 # columns and are followed by personal pronouns in 

1579 # remaining columns (basically headers that 

1580 # evaluate to no tags). In such cases widen the 

1581 # left-side header to the full row. 

1582 if previously_seen: # id(cell) in seen_cells previously 

1583 col0_followed_by_nonempty = True 

1584 return col, col0_followed_by_nonempty, col0_hdrspan 

1585 elif col0_hdrspan is None: 

1586 col0_hdrspan = hdrspan 

1587 elif any(all_hdr_tags): 1587 ↛ 1655line 1587 didn't jump to line 1655 because the condition on line 1587 was always true

1588 col0_cats = tagset_cats(col0_hdrspan.tagsets) 

1589 later_cats = tagset_cats(all_hdr_tags) 

1590 col0_allowed = get_lang_conf(lang, "hdr_expand_first") 

1591 later_allowed = get_lang_conf(lang, "hdr_expand_cont") 

1592 later_allowed = later_allowed | set(["dummy"]) 

1593 # dummy2 has different behavior than plain dummy 

1594 # and does not belong here. 

1595 

1596 # print("col0_cats={} later_cats={} " 

1597 # "fol_by_nonempty={} col_idx={} end={} " 

1598 # "tagsets={}" 

1599 # .format(col0_cats, later_cats, 

1600 # col0_followed_by_nonempty, col_idx, 

1601 # col0_hdrspan.start + 

1602 # col0_hdrspan.colspan, 

1603 # col0_hdrspan.tagsets)) 

1604 # print("col0.rowspan={} rowspan={}" 

1605 # .format(col0_hdrspan.rowspan, rowspan)) 

1606 # Only expand if [col0_cats and later_cats are allowed 

1607 # and don't overlap] and [col0 has tags], and there have 

1608 # been [no disallowed cells in between]. 

1609 # 

1610 # There are three cases here: 

1611 # - col0_hdrspan set, continue with allowed current 

1612 # - col0_hdrspan set, expand, start new 

1613 # - col0_hdrspan set, no expand, start new 

1614 if ( 

1615 not col0_followed_by_nonempty 

1616 and 

1617 # XXX Only one cat of tags: kunna/Swedish 

1618 # XXX len(col0_cats) == 1 and 

1619 col0_hdrspan.rowspan >= rowspan 

1620 and 

1621 # from hdrspan 

1622 not (later_cats - later_allowed) 

1623 and not (col0_cats & later_cats) 

1624 ): 

1625 # First case: col0 set, continue 

1626 return col, col0_followed_by_nonempty, col0_hdrspan 

1627 # We are going to start new col0_hdrspan. Check if 

1628 # we should expand. 

1629 if ( 

1630 not col0_followed_by_nonempty 

1631 and not (col0_cats - col0_allowed) 

1632 and 

1633 # Only "allowed" allowed 

1634 # XXX len(col0_cats) == 1 and 

1635 col_idx > col0_hdrspan.start + col0_hdrspan.colspan 

1636 ): 

1637 # col_idx is beyond current colspan 

1638 # *Expand* current col0_hdrspan 

1639 # print("EXPANDING COL0 MID: {} from {} to {} " 

1640 # "cols {}" 

1641 # .format(col0_hdrspan.text, 

1642 # col0_hdrspan.colspan, 

1643 # col_idx - col0_hdrspan.start, 

1644 # col0_hdrspan.tagsets)) 

1645 col0_hdrspan.colspan = col_idx - col0_hdrspan.start 

1646 col0_hdrspan.expanded = True 

1647 # Clear old col0_hdrspan 

1648 if col == debug_cell_text: 1648 ↛ 1649line 1648 didn't jump to line 1649 because the condition on line 1648 was never true

1649 print("START NEW {}".format(hdrspan.tagsets)) 

1650 col0_hdrspan = None 

1651 # Now start new, unless it comes from previous row 

1652 if not previously_seen: 1652 ↛ 1655line 1652 didn't jump to line 1655 because the condition on line 1652 was always true

1653 col0_hdrspan = hdrspan 

1654 col0_followed_by_nonempty = False 

1655 return col, col0_followed_by_nonempty, col0_hdrspan 

1656 

def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
    """Split a table cell's text into alternative forms.

    Returns a tuple ``(col, alts, split_extra_tags)`` where ``col`` is
    the (possibly rewritten) cell text, ``alts`` is the list of
    alternative form strings, and ``split_extra_tags`` is a list of
    extra tags implied by a language-specific special split (empty
    otherwise).
    """
    split_extra_tags = []
    if col and is_superscript(col[0]):
        # Cells starting with a superscript are kept whole; they are
        # footnote-like material rather than splittable forms.
        alts = [col]
    else:
        separators = [";", "•", r"\n", " or "]
        if " + " not in col:
            separators.append(",")
        if not col.endswith("/"):
            separators.append("/")
        if col in special_phrase_splits:
            # Use language-specific special splits.
            # These are phrases and constructions that have
            # unique ways of splitting, not specific characters
            # to split on like with the default splitting.
            alts, tags = special_phrase_splits[col]
            split_extra_tags = tags.split()
            for x in split_extra_tags:
                assert x in valid_tags
            assert isinstance(alts, (list, tuple))
            assert isinstance(tags, str)
        else:
            # Use default splitting. However, recognize
            # language-specific replacements and change them to magic
            # characters before splitting. This way we won't split
            # them. This is important for, e.g., recognizing
            # alternative pronouns.
            # The magic characters are characters out of Unicode scope
            # that are given a simple incremental value, int > unicode.
            repls = {}
            magic_ch = MAGIC_FIRST
            trs = get_lang_conf(lang, "form_transformations")
            # trs is a list of lists of strings
            for _, v, _, _ in trs:
                # v is a pattern string, like "^ich"
                # form_transformations data is doing double-duty here,
                # because the pattern strings are already known to us and
                # not meant to be split.
                m = re.search(v, col)
                if m is not None:
                    # if pattern found in text
                    magic = chr(magic_ch)
                    magic_ch += 1  # next magic character value
                    col = re.sub(v, magic, col)  # replace with magic ch
                    repls[magic] = m.group(0)
                    # remember what regex match string each magic char
                    # replaces. .group(0) is the whole match.
            alts0 = split_at_comma_semi(col, separators=separators)
            # with magic characters in place, split the text so that
            # pre-transformation text is out of the way.
            alts = []
            for alt in alts0:
                # create a new list with the separated items and
                # the magic characters replaced with the original texts.
                for k, v in repls.items():
                    # BUG FIX: use str.replace() instead of re.sub().
                    # The remembered original text `v` was used as a
                    # regex replacement string, so backslashes or group
                    # references ("\1", "\g<...>") occurring in the
                    # matched wiki text would be interpreted (or raise
                    # re.error).  Plain string replacement restores the
                    # original text verbatim; `k` is a single magic
                    # character, so literal matching is also correct.
                    alt = alt.replace(k, v)
                alts.append(alt)

    # Remove "*" from beginning of forms, as in non-attested
    # or reconstructed forms. Otherwise it might confuse romanization
    # detection.
    alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts)
    alts = list(
        x for x in alts if not re.match(r"pronounced with |\(with ", x)
    )
    alts = list(
        re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts
    )
    return col, alts, split_extra_tags

1727 

def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
    # Handle the special case where romanization is given under
    # normal form, e.g. in Russian. There can be multiple
    # comma-separated forms in each case. We also handle the case
    # where instead of romanization we have IPA pronunciation
    # (e.g., avoir/French/verb).
    #
    # Returns a list of (form, romanization, ipa) triples; unknown
    # slots are empty strings.  The elif order is significant: the
    # first matching layout wins.
    len2 = len(alts) // 2
    # Check for IPAs (forms first, IPAs under)
    # base, base, IPA, IPA
    if (
        len(alts) % 2 == 0  # Divisible by two
        and all(
            re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
            for x in alts[len2:]
        )
    ):  # In the second half of alts
        nalts = list(
            (alts[i], "", alts[i + len2])
            # List of tuples: (base, "", ipa)
            for i in range(len2)
        )
    # base, base, base, IPA
    elif (
        len(alts) > 2
        and re.match(r"^\s*/.*/\s*$", alts[-1])
        and all(not x.startswith("/") for x in alts[:-1])
    ):
        # Only if the last alt is IPA; it is paired with every base.
        nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
    # base, IPA, IPA, IPA
    elif (
        len(alts) > 2
        and not alts[0].startswith("/")
        and all(
            re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
        )
    ):
        # First is base and the rest is IPA alternatives
        nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

    # Check for romanizations, forms first, romanizations under
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    # Remove ends of strings starting from ^.
                    # Superscripts have been already removed
                    # from the string, while ^xyz needs to be
                    # removed separately, though it's usually
                    # something with a single letter?
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            == "other"
            for x in alts[:len2]
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in x if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for x in alts[len2:]
        )
    ):
        nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
    # Check for romanizations, forms and romanizations alternating
    elif (
        len(alts) % 2 == 0
        and not any("(" in x for x in alts)
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            == "other"
            for i in range(0, len(alts), 2)
        )
        and all(
            classify_desc(
                re.sub(
                    r"\^.*$",
                    "",
                    "".join(xx for xx in alts[i] if not is_superscript(xx)),
                )
            )
            in ("romanization", "english")
            for i in range(1, len(alts), 2)
        )
    ):
        # odds
        nalts = list(
            (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
        )
        # evens
    # Handle complex Georgian entries with alternative forms and*
    # *romanizations. It's a bit of a mess. Remove this kludge if not
    # needed anymore. NOTE THAT THE PARENTHESES ON THE WEBSITE ARE NOT
    # DISPLAYED. They are put inside their own span elements that are
    # then hidden with some CSS.
    # https://en.wiktionary.org/wiki/%E1%83%90%E1%83%9B%E1%83%94%E1%83%A0%E1%83%98%E1%83%99%E1%83%98%E1%83%A1_%E1%83%A8%E1%83%94%E1%83%94%E1%83%A0%E1%83%97%E1%83%94%E1%83%91%E1%83%A3%E1%83%9A%E1%83%98_%E1%83%A8%E1%83%A2%E1%83%90%E1%83%A2%E1%83%94%E1%83%91%E1%83%98
    # ამერიკის შეერთებულ შტატებს(ა) (ameriḳis šeertebul šṭaṭebs(a))
    # The above should generate two alts entries, with two different
    # parallel versions, one without (a) and with (a) at the end,
    # for both the Georgian original and the romanization.
    elif (
        tablecontext.template_name == "ka-decl-noun"
        and len(alts) == 1
        and " (" in alts[0]
    ):
        nalts = ka_decl_noun_template_cell(alts)
    else:
        # Default: expand parenthesized optional/alternative infixes,
        # e.g. "kind(er)" -> "kind"/"kinder", producing all variants.
        new_alts = []
        for alt in alts:
            lst = [""]
            idx = 0
            for m in re.finditer(
                r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                # start OR letter OR asterisk (word/word*)
                # \\___________group 1_______/ \  \_g3_///
                #  \                            \__gr. 2_//
                #   \_____________group 0________________/
                alt,
            ):
                v = m.group(2)  # (word/word/word...)
                if (
                    classify_desc(v) == "tags"  # Tags inside parens
                    or m.group(0) == alt
                ):  # All in parens
                    continue
                new_lst = []
                for x in lst:
                    x += alt[idx : m.start()] + m.group(1)
                    # alt until letter or asterisk
                    idx = m.end()
                    vparts = v.split("/")
                    # group(2) = ["word", "wörd"...]
                    if len(vparts) == 1:
                        new_lst.append(x)
                        new_lst.append(x + v)
                        # "kind(er)" -> ["kind", "kinder"]
                    else:
                        for vv in vparts:
                            new_lst.append(x + vv)
                        # "lampai(tten/den)" ->
                        # ["lampaitten", "lampaiden"]
                lst = new_lst
            for x in lst:
                new_alts.append(x + alt[idx:])
                # add the end of alt
        nalts = list((x, "", "") for x in new_alts)
        # [form, no romz, no ipa]
    return nalts

1891 

def find_semantic_parens(form: str) -> tuple[str, list[str]]:
    """Strip semantic bracketing from a form and return implied tags.

    "Some languages" (=Greek) use brackets to mark things that
    require tags, like (informality), [rarity] and {archaicity}.
    The brackets are always removed; the corresponding tags are added
    only when the language's configuration enables that convention.
    Returns the stripped form and a list of extra tags.
    """
    # (pattern, number of chars to strip from each end, tag factory).
    # Order matters: the "{[...]}" combination must be tried before the
    # plain "{...}" and "[...]" cases.
    bracket_rules = (
        (
            r"\([^][(){}]*\)$",
            1,
            lambda: ["informal"]
            if get_lang_conf(lang, "parentheses_for_informal")
            else [],
        ),
        (
            r"\{\[[^][(){}]*\]\}$",
            2,
            # είμαι/Greek/Verb
            lambda: ["rare", "archaic"]
            if get_lang_conf(lang, "square_brackets_for_rare")
            and get_lang_conf(lang, "curly_brackets_for_archaic")
            else [],
        ),
        (
            r"\{[^][(){}]*\}$",
            1,
            # είμαι/Greek/Verb
            lambda: ["archaic"]
            if get_lang_conf(lang, "curly_brackets_for_archaic")
            else [],
        ),
        (
            r"\[[^][(){}]*\]$",
            1,
            # είμαι/Greek/Verb
            lambda: ["rare"]
            if get_lang_conf(lang, "square_brackets_for_rare")
            else [],
        ),
    )
    for pattern, strip_n, make_tags in bracket_rules:
        if re.match(pattern, form):
            return form[strip_n:-strip_n], make_tags()
    return form, []

1926 

def handle_parens(
    form: str, roman: str, clitic: str, extra_tags: list[str]
) -> tuple[str, str, str]:
    # Interpret one parenthesized fragment found in a cell's form text
    # and fold it into (form, roman, clitic).
    #
    # NOTE(review): this closure reads `paren` (the text inside the
    # parentheses), `m` (the regex match object covering the
    # parenthesized span within `form`) and `subst` (the text used to
    # replace that span) from the enclosing scope; they appear to be
    # set by a finditer loop in the enclosing function — confirm
    # against the caller.  `extra_tags` is mutated in place when the
    # parens turn out to contain tags.
    if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
        # is there a clitic starting with apostrophe?
        clitic = paren
        # assume the whole paren is a clitic
        # then remove paren from form
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif classify_desc(paren) == "tags":
        tagsets1, topics1 = decode_tags(paren)
        if not topics1:
            for ts in tagsets1:
                ts = tuple(x for x in ts if " " not in x)
                # There are some generated tags containing
                # spaces; do not let them through here.
                extra_tags.extend(ts)
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
    # brackets contain romanization
    elif (
        m.start() > 0
        and not roman
        and classify_desc(form[: m.start()]) == "other"
        and
        # "other" ~ text
        classify_desc(paren) in ("romanization", "english")
        and not re.search(r"^with |-form$", paren)
    ):
        roman = paren
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    elif re.search(r"^with |-form", paren):
        # "with ..." / "...-form" annotations are dropped from the form
        # text without being recorded anywhere.
        form = (form[: m.start()] + subst + form[m.end() :]).strip()
    return form, roman, clitic

1960 

def merge_row_and_column_tags(form, some_has_covered_text):
    # Merge column tags and row tags. We give preference
    # to moods etc coming from rowtags (cf. austteigen/German/Verb
    # imperative forms).

    # In certain cases, what a tag means depends on whether
    # it is a row or column header. Depending on the language,
    # we replace certain tags with others if they're in
    # a column or row

    # Produces one form entry per (rowtag-set, coltag-set) pair.
    # Reads many enclosing-scope variables: rowtags, coltags,
    # global_tags, extra_tags, refs_tags, tablecontext, lang, pos,
    # wxr, col_idx, has_covering_hdr, roman, ipa, clitic, source.
    # Returns (entries, possibly-modified form, some_has_covered_text).
    ret = []
    # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
    # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
    for rt in sorted(rowtags):
        if "dummy-use-as-coltags" in rt:
            continue
        # if lang was in rowtag_replacements)
        # if not rtagreplacs == None:
        #    rt = replace_directional_tags(rt, rtagreplacs)
        for ct in sorted(coltags):
            if "dummy-use-as-rowtags" in ct:
                continue
            # if lang was in coltag_replacements
            # if not ctagreplacs == None:
            #    ct = replace_directional_tags(ct,
            #                                  ctagreplacs)
            tags = set(global_tags)
            tags.update(extra_tags)
            tags.update(rt)
            tags.update(refs_tags)
            tags.update(tablecontext.section_header)
            # Merge tags from column. For certain kinds of tags,
            # those coming from row take precedence.
            old_tags = set(tags)
            for t in ct:
                c = valid_tags[t]
                if c in ("mood", "case", "number") and any(
                    valid_tags[tt] == c for tt in old_tags
                ):
                    continue
                tags.add(t)

            # Extract language-specific tags from the
            # form. This may also adjust the form.
            form, lang_tags = lang_specific_tags(lang, pos, form)
            tags.update(lang_tags)

            # For non-finite verb forms, see if they have
            # a gender/class suffix
            if pos == "verb" and any(
                valid_tags[t] == "non-finite" for t in tags
            ):
                form, tt = parse_head_final_tags(wxr, lang, form)
                tags.update(tt)

            # Remove "personal" tag if have nth person; these
            # come up with e.g. reconhecer/Portuguese/Verb. But
            # not if we also have "pronoun"
            if (
                "personal" in tags
                and "pronoun" not in tags
                and any(
                    x in tags
                    for x in [
                        "first-person",
                        "second-person",
                        "third-person",
                    ]
                )
            ):
                tags.remove("personal")

            # If we have impersonal, remove person and number.
            # This happens with e.g. viajar/Portuguese/Verb
            if "impersonal" in tags:
                tags = tags - set(
                    [
                        "first-person",
                        "second-person",
                        "third-person",
                        "singular",
                        "plural",
                    ]
                )

            # Remove unnecessary "positive" tag from verb forms
            if pos == "verb" and "positive" in tags:
                if "negative" in tags:
                    tags.remove("negative")
                tags.remove("positive")

            # Many Russian (and other Slavic) inflection tables
            # have animate/inanimate distinction that generates
            # separate entries for neuter/feminine, but the
            # distinction only applies to masculine. Remove them
            # form neuter/feminine and eliminate duplicates.
            if get_lang_conf(lang, "masc_only_animate"):
                for t1 in ("animate", "inanimate"):
                    for t2 in ("neuter", "feminine"):
                        if (
                            t1 in tags
                            and t2 in tags
                            and "masculine" not in tags
                            and "plural" not in tags
                        ):
                            tags.remove(t1)

            # German adjective tables contain "(keiner)" etc
            # for mixed declension plural. When the adjective
            # disappears and it becomes just one word, remove
            # the "includes-article" tag. e.g. eiskalt/German
            if "includes-article" in tags and " " not in form:
                tags.remove("includes-article")

            # Handle ignored forms. We mark that the form was
            # provided. This is important information; some words
            # just do not have a certain form. However, there also
            # many cases where no word in a language has a
            # particular form. Post-processing could detect and
            # remove such cases.
            if form in IGNORED_COLVALUES:
                # if cell text seems to be ignorable
                if "dummy-ignore-skipped" in tags:
                    continue
                if (
                    col_idx not in has_covering_hdr
                    and some_has_covered_text
                ):
                    continue
                # don't ignore this cell if there's been a header
                # above it
                form = "-"
            elif col_idx in has_covering_hdr:
                some_has_covered_text = True

            # Handle ambiguous object concord. If a header
            # gives the "dummy-object-concord"-tag to a word,
            # replace person, number and gender tags with
            # their "object-" counterparts so that the verb
            # agrees with the object instead.
            # Use only when the verb has ONLY object agreement!
            # a پخول/Pashto
            if "dummy-object-concord" in tags:
                for subtag, objtag in object_concord_replacements.items():
                    if subtag in tags:
                        tags.remove(subtag)
                        tags.add(objtag)

            # Remove the dummy mood tag that we sometimes
            # use to block adding other mood and related
            # tags
            tags = tags - set(
                [
                    "dummy-mood",
                    "dummy-tense",
                    "dummy-ignore-skipped",
                    "dummy-object-concord",
                    "dummy-reset-headers",
                    "dummy-use-as-coltags",
                    "dummy-use-as-rowtags",
                    "dummy-store-hdrspan",
                    "dummy-load-stored-hdrspans",
                    "dummy-reset-stored-hdrspans",
                    "dummy-section-header",
                ]
            )

            # Perform language-specific tag replacements according
            # to rules in a table.
            lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
            if lang_tag_mappings is not None:
                for pre, post in lang_tag_mappings.items():
                    if all(t in tags for t in pre):
                        tags = (tags - set(pre)) | set(post)

            # Warn if there are entries with empty tags
            if not tags:
                wxr.wtp.debug(
                    "inflection table: empty tags for {}".format(form),
                    sortid="inflection/1826",
                )

            # Warn if form looks like IPA
            ########## XXX ########
            # Because IPA is its own unicode block, we could also
            # technically do a Unicode name check to see if a string
            # contains IPA. Not all valid IPA characters are in the
            # IPA extension block, so you can technically have false
            # negatives if it's something like /toki/, but it
            # shouldn't give false positives.
            # Alternatively, you could make a list of IPA-admissible
            # characters and reject non-IPA stuff with that.
            if re.match(r"\s*/.*/\s*$", form):
                wxr.wtp.debug(
                    "inflection table form looks like IPA: "
                    "form={} tags={}".format(form, tags),
                    sortid="inflection/1840",
                )

            # Note that this checks `form`, not `in tags`
            if form == "dummy-ignored-text-cell":
                continue

            if "dummy-remove-this-cell" in tags:
                continue

            # Add the form
            tags = list(sorted(tags))
            dt = {"form": form, "tags": tags, "source": source}
            if roman:
                dt["roman"] = roman
            if ipa:
                dt["ipa"] = ipa
            ret.append(dt)
            # If we got separate clitic form, add it
            if clitic:
                dt = {
                    "form": clitic,
                    "tags": tags + ["clitic"],
                    "source": source,
                }
                ret.append(dt)
    return ret, form, some_has_covered_text

2184 

2185 # First extract definitions from cells 

2186 # See defs_ht for footnote defs stuff 

2187 for row in rows: 

2188 for cell in row: 

2189 text, refs, defs, hdr_tags = extract_cell_content( 

2190 lang, word, cell.text 

2191 ) 

2192 # refs, defs = footnote stuff, defs -> (ref, def) 

2193 add_defs(defs) 

2194 # Extract definitions from text after table 

2195 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after) 

2196 add_defs(defs) 

2197 

2198 # Then extract the actual forms 

2199 ret = [] 

2200 hdrspans = [] 

2201 first_col_has_text = False 

2202 rownum = 0 

2203 title = None 

2204 global_tags = [] 

2205 table_tags = [] 

2206 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits") 

2207 form_replacements = get_lang_conf(lang, "form_replacements") 

2208 form_transformations = get_lang_conf(lang, "form_transformations") 

2209 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells") 

2210 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups") 

2211 

2212 for title in titles: 

2213 more_global_tags, more_table_tags, extra_forms = parse_title( 

2214 title, source 

2215 ) 

2216 global_tags.extend(more_global_tags) 

2217 table_tags.extend(more_table_tags) 

2218 ret.extend(extra_forms) 

2219 cell_rowcnt = collections.defaultdict(int) 

2220 seen_cells = set() 

2221 has_covering_hdr = set() 

2222 some_has_covered_text = False 

2223 for row in rows: 

2224 # print("ROW:", row) 

2225 # print("====") 

2226 # print(f"Start of PREVIOUS row hdrspans:" 

2227 # f"{tuple(sp.tagsets for sp in hdrspans)}") 

2228 # print(f"Start of row txt: {tuple(t.text for t in row)}") 

2229 if not row: 2229 ↛ 2230line 2229 didn't jump to line 2230 because the condition on line 2229 was never true

2230 continue # Skip empty rows 

2231 all_headers = all(x.is_title or not x.text.strip() for x in row) 

2232 text = row[0].text 

2233 if ( 

2234 row[0].is_title 

2235 and text 

2236 and not is_superscript(text[0]) 

2237 and text not in infl_map # zealous inflation map? 

2238 and ( 

2239 re.match(r"Inflection ", text) 

2240 or re.sub( 

2241 r"\s+", 

2242 " ", # flatten whitespace 

2243 re.sub( 

2244 r"\s*\([^)]*\)", 

2245 "", 

2246 # Remove whitespace+parens 

2247 text, 

2248 ), 

2249 ).strip() 

2250 not in infl_map 

2251 ) 

2252 and not re.match(infl_start_re, text) 

2253 and all( 

2254 x.is_title == row[0].is_title and x.text == text 

2255 # all InflCells in `row` have the same is_title and text 

2256 for x in row 

2257 ) 

2258 ): 

2259 if text and title is None: 

2260 # Only if there were no titles previously make the first 

2261 # text that is found the title 

2262 title = text 

2263 if re.match(r"(Note:|Notes:)", title): 2263 ↛ 2264line 2263 didn't jump to line 2264 because the condition on line 2263 was never true

2264 continue # not a title 

2265 more_global_tags, more_table_tags, extra_forms = parse_title( 

2266 title, source 

2267 ) 

2268 global_tags.extend(more_global_tags) 

2269 table_tags.extend(more_table_tags) 

2270 ret.extend(extra_forms) 

2271 continue # Skip title rows without incrementing i 

2272 if "dummy-skip-this" in global_tags: 2272 ↛ 2273line 2272 didn't jump to line 2273 because the condition on line 2272 was never true

2273 return [] 

2274 rowtags = [()] 

2275 # have_hdr = False 

2276 # have_hdr never used? 

2277 have_text = False 

2278 samecell_cnt = 0 

2279 col0_hdrspan = None # col0 or later header (despite its name) 

2280 col0_followed_by_nonempty = False 

2281 row_empty = True 

2282 for col_idx, cell in enumerate(row): 

2283 colspan = cell.colspan # >= 1 

2284 rowspan = cell.rowspan # >= 1 

2285 previously_seen = id(cell) in seen_cells 

2286 # checks to see if this cell was in the previous ROW 

2287 seen_cells.add(id(cell)) 

2288 if samecell_cnt == 0: 

2289 # First column of a (possible multi-column) cell 

2290 samecell_cnt = colspan - 1 

2291 else: 

2292 assert samecell_cnt > 0 

2293 samecell_cnt -= 1 

2294 continue 

2295 

2296 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0 

2297 # never used? 

2298 

2299 # defaultdict(int) around line 1900 

2300 cell_rowcnt[id(cell)] += 1 

2301 # => how many cols this spans 

2302 col = cell.text 

2303 if not col: 

2304 continue 

2305 row_empty = False 

2306 is_title = cell.is_title 

2307 

2308 # If the cell has a target, i.e., text after colon, interpret 

2309 # it as simply specifying a value for that value and ignore 

2310 # it otherwise. 

2311 if cell.target: 

2312 text, refs, defs, hdr_tags = extract_cell_content( 

2313 lang, word, col 

2314 ) 

2315 if not text: 2315 ↛ 2316line 2315 didn't jump to line 2316 because the condition on line 2315 was never true

2316 continue 

2317 refs_tags = set() 

2318 for ref in refs: # gets tags from footnotes 2318 ↛ 2319line 2318 didn't jump to line 2319 because the loop on line 2318 never started

2319 if ref in def_ht: 

2320 refs_tags.update(def_ht[ref]) 

2321 rowtags = expand_header( 

2322 wxr, 

2323 tablecontext, 

2324 word, 

2325 lang, 

2326 pos, 

2327 text, 

2328 [], 

2329 silent=True, 

2330 depth=depth, 

2331 column_number=col_idx, 

2332 ) 

2333 rowtags = list( 

2334 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags) 

2335 ) 

2336 is_title = False 

2337 col = cell.target 

2338 

2339 # print(rownum, col_idx, col) 

2340 # print(f"is_title: {is_title}") 

2341 if is_title: 

2342 # It is a header cell 

2343 text, refs, defs, hdr_tags = extract_cell_content( 

2344 lang, word, col 

2345 ) 

2346 if not text: 

2347 continue 

2348 # Extract tags from referenced footnotes 

2349 refs_tags = set() 

2350 for ref in refs: 

2351 if ref in def_ht: 

2352 refs_tags.update(def_ht[ref]) 

2353 

2354 # Expand header to tags 

2355 v = expand_header( 

2356 wxr, 

2357 tablecontext, 

2358 word, 

2359 lang, 

2360 pos, 

2361 text, 

2362 [], 

2363 silent=True, 

2364 depth=depth, 

2365 column_number=col_idx, 

2366 ) 

2367 # print("EXPANDED {!r} to {}".format(text, v)) 

2368 

2369 if col_idx == 0: 

2370 # first_col_has_text is used for a test to ignore 

2371 # upper-left cells that are just text without 

2372 # header info 

2373 first_col_has_text = True 

2374 # Check if the header expands to reset hdrspans 

2375 if any("dummy-reset-headers" in tt for tt in v): 

2376 new_hdrspans = [] 

2377 for hdrspan in hdrspans: 

2378 # if there are HdrSpan objects (abstract headers with 

2379 # row- and column-spans) that are to the left or at the 

2380 # same row or below, KEEP those; things above and to 

2381 # the right of the hdrspan with dummy-reset-headers 

2382 # are discarded. Tags from the header together with 

2383 # dummy-reset-headers are kept as normal. 

2384 if ( 

2385 hdrspan.start + hdrspan.colspan < col_idx 

2386 or hdrspan.rownum > rownum - cell.rowspan 

2387 ): 

2388 new_hdrspans.append(hdrspan) 

2389 hdrspans = new_hdrspans 

2390 

2391 for tt in v: 

2392 if "dummy-section-header" in tt: 2392 ↛ 2393line 2392 didn't jump to line 2393 because the condition on line 2392 was never true

2393 tablecontext.section_header = tt 

2394 break 

2395 if "dummy-reset-section-header" in tt: 2395 ↛ 2396line 2395 didn't jump to line 2396 because the condition on line 2395 was never true

2396 tablecontext.section_header = [] 

2397 # Text between headers on a row causes earlier headers to 

2398 # be reset 

2399 if have_text: 

2400 # print(" HAVE_TEXT BEFORE HDR:", col) 

2401 # Reset rowtags if new title column after previous 

2402 # text cells 

2403 # +-----+-----+-----+-----+ 

2404 # |hdr-a|txt-a|hdr-B|txt-B| 

2405 # +-----+-----+-----+-----+ 

2406 # ^reset rowtags=> 

2407 # XXX beware of header "—": "" - must not clear on that if 

2408 # it expands to no tags 

2409 rowtags = [()] 

2410 # have_hdr = True 

2411 # have_hdr never used? 

2412 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags)) 

2413 # Update rowtags and coltags 

2414 has_covering_hdr.add(col_idx) # col_idx == current column 

2415 # has_covering_hdr is a set that has the col_idx-ids of columns 

2416 # that have previously had some kind of header. It is never 

2417 # resetted inside the col_idx-loops OR the bigger rows-loop, so 

2418 # applies to the whole table. 

2419 

2420 rowtags, new_coltags, all_hdr_tags = generate_tags( 

2421 rowtags, table_tags 

2422 ) 

2423 

2424 if any("dummy-skip-this" in ts for ts in rowtags): 

2425 continue # Skip this cell 

2426 

2427 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2427 ↛ 2428line 2427 didn't jump to line 2428 because the condition on line 2427 was never true

2428 hdrspans.extend(tablecontext.stored_hdrspans) 

2429 

2430 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2430 ↛ 2431line 2430 didn't jump to line 2431 because the condition on line 2430 was never true

2431 tablecontext.stored_hdrspans = [] 

2432 

2433 if any("dummy-store-hdrspan" in ts for ts in v): 2433 ↛ 2435line 2433 didn't jump to line 2435 because the condition on line 2433 was never true

2434 # print(f"STORED: {col}") 

2435 store_new_hdrspan = True 

2436 else: 

2437 store_new_hdrspan = False 

2438 

2439 new_coltags = list( 

2440 x 

2441 for x in new_coltags 

2442 if not any(t in noinherit_tags for t in x) 

2443 ) 

2444 # print("new_coltags={} previously_seen={} all_hdr_tags={}" 

2445 # .format(new_coltags, previously_seen, all_hdr_tags)) 

2446 if any(new_coltags): 

2447 ( 

2448 col, 

2449 col0_followed_by_nonempty, 

2450 col0_hdrspan, 

2451 ) = add_new_hdrspan( 

2452 col, 

2453 hdrspans, 

2454 store_new_hdrspan, 

2455 col0_followed_by_nonempty, 

2456 col0_hdrspan, 

2457 ) 

2458 

2459 continue 

2460 

2461 # These values are ignored, at least for now 

2462 if re.match(r"^(# |\(see )", col): 2462 ↛ 2463line 2462 didn't jump to line 2463 because the condition on line 2462 was never true

2463 continue 

2464 

2465 if any("dummy-skip-this" in ts for ts in rowtags): 

2466 continue # Skip this cell 

2467 

2468 # If the word has no rowtags and is a multi-row cell, then 

2469 # ignore this. This happens with empty separator rows 

2470 # within a rowspan>1 cell. cf. wander/English/Conjugation. 

2471 if rowtags == [()] and rowspan > 1: 

2472 continue 

2473 

2474 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle. 

2475 if cleanup_rules: 

2476 for regx, substitution in cleanup_rules.items(): 

2477 col = re.sub(regx, substitution, col) 

2478 

2479 if ( 2479 ↛ 2484line 2479 didn't jump to line 2484 because the condition on line 2479 was never true

2480 col_idx == 0 

2481 and not first_col_has_text 

2482 and get_lang_conf(lang, "ignore_top_left_text_cell") is True 

2483 ): 

2484 continue # Skip text at top left, as in Icelandic, Faroese 

2485 

2486 # if col0_hdrspan is not None: 

2487 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}" 

2488 # .format(col0_hdrspan.text, col)) 

2489 col0_followed_by_nonempty = True 

2490 have_text = True 

2491 

2492 # Determine column tags for the multi-column cell 

2493 combined_coltags = compute_coltags( 

2494 lang, pos, hdrspans, col_idx, colspan, col 

2495 ) 

2496 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2496 ↛ 2497line 2496 didn't jump to line 2497 because the condition on line 2496 was never true

2497 continue 

2498 

2499 # Split the text into separate forms. First simplify spaces except 

2500 # newline. 

2501 col = re.sub(r"[ \t\r]+", " ", col) 

2502 # Split the cell text into alternatives 

2503 

2504 col, alts, split_extra_tags = split_text_into_alts(col) 

2505 

2506 # Some cells have mixed form content, like text and romanization, 

2507 # or text and IPA. Handle these. 

2508 alts = handle_mixed_lines(alts) 

2509 

2510 alts = list((x, combined_coltags) for x in alts) 

2511 

2512 # Generate forms from the alternatives 

2513 # alts is a list of (tuple of forms, tuple of tags) 

2514 for (form, base_roman, ipa), coltags in alts: 

2515 form = form.strip() 

2516 extra_tags = [] 

2517 extra_tags.extend(split_extra_tags) 

2518 # Handle special splits again here, so that we can have custom 

2519 # mappings from form to form and tags. 

2520 if form in form_replacements: 

2521 replacement, tags = form_replacements[form] 

2522 for x in tags.split(): 

2523 assert x in valid_tags 

2524 assert isinstance(replacement, str) 

2525 assert isinstance(tags, str) 

2526 form = replacement 

2527 extra_tags.extend(tags.split()) 

2528 

2529 check_romanization_form_transformation = False 

2530 # loop over regexes in form_transformation and replace text 

2531 # in form using regex patterns 

2532 # this does a bit of the same stuff the above does, 

2533 # but with regexes and re.sub() instead 

2534 for ( 

2535 form_transformations_pos, 

2536 v, 

2537 subst, 

2538 tags, 

2539 ) in form_transformations: 

2540 # v is a pattern string, like "^ich" 

2541 if pos != form_transformations_pos: 

2542 continue 

2543 m = re.search(v, form) 

2544 if m is not None: 

2545 form = re.sub(v, subst, form) 

2546 for x in tags.split(): 

2547 assert x in valid_tags 

2548 extra_tags.extend(tags.split()) 

2549 check_romanization_form_transformation = True 

2550 break 

2551 

2552 # Clean the value, extracting reference symbols 

2553 form, refs, defs, hdr_tags = extract_cell_content( 

2554 lang, word, form 

2555 ) 

2556 # if refs: 

2557 # print("REFS:", refs) 

2558 extra_tags.extend(hdr_tags) 

2559 # Extract tags from referenced footnotes 

2560 refs_tags = set() 

2561 for ref in refs: 

2562 if ref in def_ht: 

2563 refs_tags.update(def_ht[ref]) 

2564 

2565 if base_roman: 

2566 if check_romanization_form_transformation: 2566 ↛ 2570line 2566 didn't jump to line 2570 because the condition on line 2566 was never true

2567 # because form_transformations are used to handle things 

2568 # where the romanization has the "same" structure, we 

2569 # need to handle that here too.... 

2570 for ( 

2571 _, 

2572 v, 

2573 subst, 

2574 _, 

2575 ) in form_transformations: 

2576 # v is a pattern string, like "^ich" 

2577 m = re.search(v, base_roman) 

2578 if m is not None: 

2579 base_roman = re.sub(v, subst, base_roman) 

2580 # XXX add tag stuff here if needed 

2581 break 

2582 

2583 base_roman, _, _, hdr_tags = extract_cell_content( 

2584 lang, word, base_roman 

2585 ) 

2586 extra_tags.extend(hdr_tags) 

2587 

2588 # Do some additional cleanup on the cell. 

2589 form = re.sub(r"^\s*,\s*", "", form) 

2590 form = re.sub(r"\s*,\s*$", "", form) 

2591 form = re.sub(r"\s*(,\s*)+", ", ", form) 

2592 form = re.sub(r"(?i)^Main:", "", form) 

2593 form = re.sub(r"\s+", " ", form) 

2594 form = form.strip() 

2595 

2596 # Look for parentheses that have semantic meaning 

2597 form, et = find_semantic_parens(form) 

2598 extra_tags.extend(et) 

2599 

2600 # Handle parentheses in the table element. We parse 

2601 # tags anywhere and romanizations anywhere but beginning. 

2602 roman = base_roman 

2603 paren = None 

2604 clitic = None 

2605 m = re.search(r"(\s+|^)\(([^)]*)\)", form) 

2606 # start|spaces + (anything) 

2607 if m is not None: 

2608 subst = m.group(1) 

2609 paren = m.group(2) 

2610 else: 

2611 m = re.search(r"\(([^)]*)\)(\s+|$)", form) 

2612 # (anything) + spaces|end 

2613 if m is not None: 2613 ↛ 2614line 2613 didn't jump to line 2614 because the condition on line 2613 was never true

2614 paren = m.group(1) 

2615 subst = m.group(2) 

2616 if paren is not None: 

2617 form, roman, clitic = handle_parens( 

2618 form, roman, clitic, extra_tags 

2619 ) 

2620 

2621 # Ignore certain forms that are not really forms, 

2622 # unless they're really, really close to the article title 

2623 if form in ( 2623 ↛ 2628line 2623 didn't jump to line 2628 because the condition on line 2623 was never true

2624 "", 

2625 "unchanged", 

2626 "after an", # in sona/Irish/Adj/Mutation 

2627 ): 

2628 Lev = distw([form], word) 

2629 if form and Lev < 0.1: 

2630 wxr.wtp.debug( 

2631 "accepted possible false positive '{}' with" 

2632 "> 0.1 Levenshtein distance in {}/{}".format( 

2633 form, word, lang 

2634 ), 

2635 sortid="inflection/2213", 

2636 ) 

2637 elif form and Lev < 0.3: 

2638 wxr.wtp.debug( 

2639 "skipped possible match '{}' with > 0.3" 

2640 "Levenshtein distance in {}/{}".format( 

2641 form, word, lang 

2642 ), 

2643 sortid="inflection/2218", 

2644 ) 

2645 continue 

2646 else: 

2647 continue 

2648 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} " 

2649 # "FORM={!r} ROMAN={!r}" 

2650 # .format(rowtags, coltags, refs_tags, 

2651 # form, roman)) 

2652 

2653 # Merge tags from row and column and do miscellaneous 

2654 # tag-related handling. 

2655 ( 

2656 merge_ret, 

2657 form, 

2658 some_has_covered_text, 

2659 ) = merge_row_and_column_tags(form, some_has_covered_text) 

2660 ret.extend(merge_ret) 

2661 

2662 # End of row. 

2663 rownum += 1 

2664 # For certain languages, if the row was empty, reset 

2665 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb). 

2666 if row_empty and get_lang_conf(lang, "empty_row_resets"): 

2667 hdrspans = [] 

2668 # Check if we should expand col0_hdrspan. 

2669 if col0_hdrspan is not None: 

2670 col0_allowed = get_lang_conf(lang, "hdr_expand_first") 

2671 col0_cats = tagset_cats(col0_hdrspan.tagsets) 

2672 # Only expand if col0_cats and later_cats are allowed 

2673 # and don't overlap and col0 has tags, and there have 

2674 # been no disallowed cells in between. 

2675 if ( 

2676 not col0_followed_by_nonempty 

2677 and not (col0_cats - col0_allowed) 

2678 and 

2679 # len(col0_cats) == 1 and 

2680 col_idx > col0_hdrspan.start + col0_hdrspan.colspan 

2681 ): 

2682 # If an earlier header is only followed by headers that yield 

2683 # no tags, expand it to entire row 

2684 # print("EXPANDING COL0: {} from {} to {} cols {}" 

2685 # .format(col0_hdrspan.text, col0_hdrspan.colspan, 

2686 # len(row) - col0_hdrspan.start, 

2687 # col0_hdrspan.tagsets)) 

2688 col0_hdrspan.colspan = len(row) - col0_hdrspan.start 

2689 col0_hdrspan.expanded = True 

2690 # XXX handle refs and defs 

2691 # for x in hdrspans: 

2692 # print(" HDRSPAN {} {} {} {!r}" 

2693 # .format(x.start, x.colspan, x.tagsets, x.text)) 

2694 

2695 # Post-process German nouns with articles in separate columns. We move the 

2696 # definite/indefinite/usually-without-article markers into the noun and 

2697 # remove the article entries. 

2698 if get_lang_conf(lang, "articles_in_separate_columns") and any( 

2699 "noun" in x["tags"] for x in ret 

2700 ): 

2701 new_ret = [] 

2702 saved_tags = set() 

2703 had_noun = False 

2704 for dt in ret: 

2705 tags = dt["tags"] 

2706 # print(tags) 

2707 if "noun" in tags: 

2708 tags = list( 

2709 sorted(set(t for t in tags if t != "noun") | saved_tags) 

2710 ) 

2711 had_noun = True 

2712 elif ( 2712 ↛ 2739line 2712 didn't jump to line 2739 because the condition on line 2712 was always true

2713 "indefinite" in tags 

2714 or "definite" in tags 

2715 or "usually-without-article" in tags 

2716 or "without-article" in tags 

2717 ): 

2718 if had_noun: 

2719 saved_tags = set(tags) 

2720 else: 

2721 saved_tags = saved_tags | set(tags) # E.g. Haus/German 

2722 remove_useless_tags(lang, pos, saved_tags) 

2723 saved_tags = saved_tags & set( 

2724 [ 

2725 "masculine", 

2726 "feminine", 

2727 "neuter", 

2728 "singular", 

2729 "plural", 

2730 "indefinite", 

2731 "definite", 

2732 "usually-without-article", 

2733 "without-article", 

2734 ] 

2735 ) 

2736 had_noun = False 

2737 continue # Skip the articles 

2738 

2739 dt = dt.copy() 

2740 dt["tags"] = tags 

2741 new_ret.append(dt) 

2742 ret = new_ret 

2743 

2744 elif possibly_ignored_forms: 

2745 # Some languages have tables with cells that are kind of separated 

2746 # and difficult to handle, like eulersche Formel/German where 

2747 # the definite and indefinite articles are just floating. 

2748 # If a language has a dict of conditionally_ignored_cells, 

2749 # and if the contents of a cell is found in one of the rules 

2750 # there, ignore that cell if it 

2751 # 1. Does not have the appropriate tag (like "definite" for "die") 

2752 # and 

2753 # 2. The title of the article is not one of the other co-words 

2754 # (ie. it's an article for the definite articles in german etc.) 

2755 # pass 

2756 new_ret = [] 

2757 for cell_data in ret: 

2758 tags = cell_data["tags"] 

2759 text = cell_data["form"] 

2760 skip_this = False 

2761 for key_tag, ignored_forms in possibly_ignored_forms.items(): 

2762 if text not in ignored_forms: 2762 ↛ 2764line 2762 didn't jump to line 2764 because the condition on line 2762 was always true

2763 continue 

2764 if word in ignored_forms: 

2765 continue 

2766 if key_tag not in tags: 

2767 skip_this = True 

2768 

2769 if skip_this: 2769 ↛ 2770line 2769 didn't jump to line 2770 because the condition on line 2769 was never true

2770 continue 

2771 new_ret.append(cell_data) 

2772 

2773 ret = new_ret 

2774 

2775 # Post-process English inflection tables, addding "multiword-construction" 

2776 # when the number of words has increased. 

2777 if lang == "English" and pos == "verb": 

2778 word_words = len(word.split()) 

2779 new_ret = [] 

2780 for dt in ret: 

2781 form = dt.get("form", "") 

2782 if len(form.split()) > word_words: 

2783 dt = dt.copy() 

2784 dt["tags"] = list(dt.get("tags", [])) 

2785 # This strange copy-assigning shuffle is preventative black 

2786 # magic; do not touch lest you invoke deep bugs. 

2787 data_append(dt, "tags", "multiword-construction") 

2788 new_ret.append(dt) 

2789 ret = new_ret 

2790 

2791 # Always insert "table-tags" detail as the first entry in any inflection 

2792 # table. This way we can reliably detect where a new table starts. 

2793 # Table-tags applies until the next table-tags entry. 

2794 if ret or table_tags: 

2795 table_tags = list(sorted(set(table_tags))) 

2796 dt = { 

2797 "form": " ".join(table_tags), 

2798 "source": source, 

2799 "tags": ["table-tags"], 

2800 } 

2801 if dt["form"] == "": 

2802 dt["form"] = "no-table-tags" 

2803 if tablecontext.template_name: 

2804 tn = { 

2805 "form": tablecontext.template_name, 

2806 "source": source, 

2807 "tags": ["inflection-template"], 

2808 } 

2809 ret = [dt] + [tn] + ret 

2810 else: 

2811 ret = [dt] + ret 

2812 

2813 return ret 

2814 

2815 

def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Parse an already-gridded inflection table (``rows`` of InflCell
    objects) into form entries and append them to ``data["forms"]``.

    Delegates the actual table interpretation to parse_simple_table() and
    then de-duplicates the returned form dicts before storing them.
    ``titles`` are table title strings, ``source`` identifies the table's
    origin, ``after`` is text following the table, and ``depth`` is the
    subtable nesting depth."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        assert isinstance(row, list)
        for x in row:
            assert isinstance(x, InflCell)
    assert isinstance(titles, list)
    for x in titles:
        assert isinstance(x, str)

    # Try to parse the table as a simple table
    ret = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if ret is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms but eliminate duplicates.
    have_forms = set()  # frozen form dicts already appended to ``data``
    for dt in ret:
        fdt = freeze(dt)
        if fdt in have_forms:
            continue  # Don't add duplicates
        # Some Russian words have Declension and Pre-reform declension partially
        # duplicating same data. Don't add "dated" tags variant if already have
        # the same without "dated" from the modern declension table

        tags = dt.get("tags", [])
        # NOTE: the ``else`` below belongs to the ``for`` loop — it runs only
        # when no ``break`` occurred, i.e. when no already-seen duplicate
        # (minus the dated tag) was found.
        for dated_tag in ("dated",):
            if dated_tag in tags:
                dt2 = dt.copy()
                tags2 = list(x for x in tags if x != dated_tag)
                dt2["tags"] = tags2
                if tags2 and freeze(dt2) in have_forms:
                    break  # Already have without archaic
        else:
            # "table-tags" marker entries are intentionally allowed to repeat
            # (one per table), so they are not recorded in have_forms.
            if "table-tags" not in tags:
                have_forms.add(fdt)
            data_append(data, "forms", dt)

2871 

2872 

def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Decide whether a table cell should be treated as a header.

    ``table_kind`` is the kind of the enclosing table node and ``kind`` the
    kind of this cell node (wikitext NodeKind or an HTML tag name string).
    ``style`` is the composite style string of the first cell in the row,
    ``cellstyle`` that of this cell.  ``cols_headered`` marks columns that
    are forced to be headers by an earlier "*" meta-tag.

    Returns a tuple ``(is_title, hdr_expansion, target, celltext)`` where
    ``hdr_expansion`` is the tag-set expansion of the cleaned cell text,
    ``target`` is an optional "key: value" target extracted from the title,
    and ``celltext`` may have been truncated to drop that target part."""
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        # NOTE(review): if table_kind is neither TABLE nor HTML, header_kind
        # stays unbound and the comparisons below would raise NameError;
        # callers apparently only pass those two kinds — confirm.
        header_kind = "th"
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # A data cell whose text expands cleanly to header tags: only accept
        # it as a heuristic header when both the language and the exact text
        # are whitelisted in LANGUAGES_WITH_CELLS_AS_HEADERS.
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    if idx >= 0 and titletext[:idx] in infl_map:
        # "Key: value" style title where the key is a known inflection
        # header; split the value off into ``target``.
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        # Style-based heuristic: same whitelist gating as above, but keyed on
        # the cell sharing the first column's composite style string.
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext

3065 

3066 

class TableContext:
    """Saved context used when parsing a table and its subtables."""

    # Fix: this was previously misspelled ``__slot__``, which Python treats
    # as an ordinary (inert) class attribute, so the slots declaration never
    # took effect and every instance carried a __dict__.  These three names
    # are the only attributes ever assigned to a TableContext.
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name=None):
        # Header spans carried over between a table and its subtables.
        self.stored_hdrspans = []
        # Section-level header data shared by subtables.
        self.section_header = []
        # Name of the inflection template that produced the table ("" when
        # unknown); any falsy value normalizes to the empty string, matching
        # the original if/else behavior.
        self.template_name = template_name or ""

3083 

3084 

def handle_wikitext_or_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms.  This adds the forms into ``data``.

    ``tree`` must be a wikitext TABLE node or an HTML <table> node.
    ``titles`` are title strings preceding the table, ``source`` identifies
    its origin and ``after`` is trailing text.  A fresh TableContext is
    created when ``tablecontext`` is None."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("-------==========-------")

    if not tablecontext:
        tablecontext = TableContext()

    def handle_table1(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        depth,
    ):
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order."""

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        col_gap_data = []  # Filling for columns with rowspan > 1
        # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
        # the column; vertical_still_left contains int
        cols_headered = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows = []

        sub_ret = []  # (rows, titles, after, depth) tuples for subtables

        # from wikitextprocessor.parser import print_tree
        # print_tree(tree)
        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            # Normalize node kind: HTML nodes are identified by tag name.
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are initially shown in tables
                    # that have more data.  The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # if (
                #     len(node.children) == 1
                #     and node.children[0].attrs.get("class") == "separator"
                # ):
                #     print("------------------ skip separator")
                #     continue

                # Parse a table row.
                row = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in get_table_cells(node):
                    # loop through each cell in the ROW

                    # (non-WikiNode children are already filtered out by
                    # get_table_cells, so no explicit skip is needed here)
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print("    UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column.  It is populated at the end of the loop, at
                        # the same time as col_gap_data.  This needs to be
                        # looped and filled this way because each `for col`
                        # iteration jumps straight to the next meaningful
                        # cell; there are no "None" cells, only emptiness
                        # between, and rowspan and colspan are just used to
                        # generate the fill cells.
                        vertical_still_left[len(row)] -= 1
                        row.append(col_gap_data[len(row)])

                        # Appending to row is how "indexing" is done here:
                        # something is appended (a filler cell here, or the
                        # real cell at the end of the loop body), which
                        # increases len(row), which is then used as the
                        # target index to check for gaps.
                        # vertical_still_left is the countdown (to 0) of how
                        # many filler rows remain; col_gap_data is only
                        # touched when a new rowspan starts, at the same time
                        # vertical_still_left is reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Too many of these errors; clamp silently instead of
                    # logging each occurrence.
                    if colspan > 100:
                        # wxr.wtp.error(
                        #     f"Colspan {colspan} over 30, set to 1",
                        #     sortid="inflection/20250113a",
                        # )
                        colspan = 100
                    if rowspan > 100:
                        # wxr.wtp.error(
                        #     f"Rowspan {rowspan} over 30, set to 1",
                        #     sortid="inflection/20250113b",
                        # )
                        rowspan = 100

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: isinstance(x, WikiNode)
                        and (x.kind == NodeKind.TABLE or x.sarg == "table"),
                    )

                    # Clean the rest of the cell.
                    celltext = clean_node(wxr, None, rest)
                    # print("CLEANED:", celltext)
                    # print(f"SUBTABLES: {tables}")

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a descrip-
                        # tive text that should be treated as a title (e.g.,
                        # "Forms with the definite article", with "definite" not
                        # mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Flush what has been collected so far, then
                            # splice the subtable's pieces in order.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    # Strip trailing superscripts (footnote references) from
                    # the candidate title text.
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #     row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/vertical_still_left
                                # if they are lacking slots for each column;
                                # they are never reset to [] during the whole
                                # table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that will be used in
                            # the future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to
                            # be filled; it is not reset to [], but
                            # decremented to 0 each time a row gets something
                            # from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects.  Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )

3446 

3447 

def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """Yield the direct wikitext/HTML cell children of ``node``.

    Wikitext table cells sometimes contain raw HTML ``<th>``/``<td>``
    elements (writing wikitext conditionals is easier that way), and the
    parser leaves those as child elements of the wikitext cell.  Besides
    yielding every WikiNode child of ``node``, this generator detaches
    any th/td HTMLNode children embedded inside a wikitext cell and
    yields them immediately after that cell.
    """

    def _is_embedded_cell(child) -> bool:
        # An HTML th/td element parsed inside a wikitext cell.
        return isinstance(child, HTMLNode) and child.sarg in ("th", "td")

    for cell in node.children:
        if not isinstance(cell, WikiNode):
            continue
        embedded = [c for c in cell.children if _is_embedded_cell(c)]
        if not embedded:
            yield cell
            continue
        # Detach the embedded th/td elements so they are not returned
        # twice (once here and once when the caller walks cell.children).
        cell.children = [c for c in cell.children if not _is_embedded_cell(c)]
        yield cell
        yield from embedded

3477 

def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Pass an HTML table on to the shared wikitext/HTML table handler.

    XXX, remove these thin wrappers?"""
    handle_wikitext_or_html_table(
        wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
    )

3486 

def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Pass a wikitext table on to the shared wikitext/HTML table handler.

    XXX, remove these thin wrappers?"""
    handle_wikitext_or_html_table(
        wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
    )

3495 

def parse_inflection_section(
    wxr, data, word, lang, pos, section, tree, tablecontext=None
):
    """Parses an inflection section on a page. ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    The section ``tree`` is walked recursively to collect inflection
    tables (both wikitext tables and HTML <table> elements), together
    with any titles found around them (e.g. in NavFrame headers), and
    each collected table is then handed to handle_wikitext_table() /
    handle_html_table() for extraction."""

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    # .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Each entry in ``tables`` is a mutable 4-item list:
    #   [kind ("wikitext"/"html"), table node, titles, after-text parts]
    # The last element accumulates stray strings encountered after the
    # table node (see recurse()).
    tables = []
    # String fragments collected from a NavFrame's NavHead; joined into
    # a title when the matching NavContent is reached.
    titleparts = []
    # Title taken from a preceding ";"-list (bolded definition-style
    # heading) that should label subsequent tables.
    preceding_bolded_title = ""

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("--------------******************----------------")

    def process_tables():
        # Flush all collected tables to the appropriate handler.
        # NOTE: loop variables deliberately shadow outer ``titles`` etc.
        for kind, node, titles, after in tables:
            after = "".join(after).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node, titles):
        # A NavFrame div groups a NavHead (title) with NavContent
        # (the table proper).  Collect its tables into a fresh list so
        # they are processed with the NavFrame's own title, then restore
        # the outer table list.
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(node, titles, navframe=False):
        # Walk the parse tree collecting tables and title text.
        # ``navframe=True`` while inside a NavFrame div (title-gathering
        # mode); cleared again when descending into NavContent.
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Stray strings after a table become that table's "after"
            # text; inside a NavFrame head they contribute to the title.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    # Show/hide button; irrelevant to extraction.
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    # The NavHead text gathered so far becomes a title
                    # for the tables inside this NavContent (unless it
                    # is just a notes header).
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
            # Other nodes fall through to the generic handling below.
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                classes = node.attrs.get("class", ())
                if "audiotable" in classes:
                    # Pronunciation audio tables are not inflections.
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
        if (
            kind == NodeKind.HTML
            and node.sarg == "div"
            and "NavFrame" in node.attrs.get("class", "").split()
        ):
            recurse_navframe(node, titles)
            return
        if kind == NodeKind.LINK:
            # Descend into link display text if present, else the target.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.HTML and node.sarg == "ref":
            # Footnote references carry no inflection data.
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A ";" definition list acts as a bolded heading for the
            # tables that follow it at the top level.
            nonlocal preceding_bolded_title
            # Local import, presumably to avoid a circular import with
            # wiktextract.page — TODO confirm.
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")