Coverage for src/wiktextract/extractor/en/inflection.py: 87%

1518 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1# Code for parsing inflection tables. 

2# 

3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org. 

4 

5import collections 

6import copy 

7import functools 

8import html 

9import itertools 

10import re 

11import unicodedata 

12from typing import Generator, Optional, Union 

13 

14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, freeze, split_at_comma_semi 

18from ...tags import valid_tags 

19from ...wxr_context import WiktextractContext 

20from .form_descriptions import ( 

21 classify_desc, 

22 decode_tags, 

23 distw, 

24 parse_head_final_tags, 

25) 

26from .inflectiondata import infl_map, infl_start_map, infl_start_re 

27from .lang_specific_configs import get_lang_conf, lang_specific_tags 

28from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS 

29from .type_utils import FormData 

30 

# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
# None means debugging is disabled; set via set_debug_cell_text() below.
debug_cell_text: Optional[str] = None

35 

36 

def set_debug_cell_text(text: str) -> None:
    """Set the module-global cell text that triggers debug output while
    parsing inflection tables (from the --debug-text-cell option)."""
    global debug_cell_text
    debug_cell_text = text

40 

41 

42TagSets = list[tuple[str, ...]] 

43 

# Column texts that are interpreted as an empty column.
# Mostly various Unicode dash/hyphen characters that Wiktionary editors use
# as "no such form" placeholders, plus a few English phrases.
IGNORED_COLVALUES = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "-",
    "/",
    "?",
    "not used",
    "not applicable",
}

66 

# These tags are never inherited from above
# (e.g. a Finnish "infinitive-i" header must not leak into sibling cells).
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

77 

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags.
# Every value is simply the key with an "object-" prefix; build the table
# programmatically instead of spelling out all 27 entries by hand.
# Insertion order matches the original hand-written table.
object_concord_replacements = {
    tag: "object-" + tag
    for tag in (
        "first-person",
        "second-person",
        "third-person",
        "singular",
        "plural",
        "definite",
        "indefinite",
    )
}
# Bantu-style noun classes 1..18
object_concord_replacements.update(
    ("class-{}".format(i), "object-class-{}".format(i)) for i in range(1, 19)
)
object_concord_replacements.update(
    masculine="object-masculine", feminine="object-feminine"
)

109 

# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: every mapped value must consist of valid tags.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Matches boilerplate like "Conjugation of <word>" that should NOT be
# treated as a tag-bearing title word.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)

150 

# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
}
# Import-time sanity check: every mapped value must consist of valid tags.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)

225 

# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "Attic": "Attic",  # e.g. καλός/Greek/Adj
    "Epic": "Epic",  # e.g. καλός/Greek/Adj
}
# Import-time sanity check: every mapped value must consist of valid tags.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

245 

# Parenthized element starts to map them to tags for form for the rest of
# the element
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: every mapped value must consist of valid tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Anchored at element start; trailing space requires a following word.
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)

266 

267 

# Regexp for cell starts that are likely definitions of reference symbols.
# See also nondef_re.
# Groups 3/5/6 capture the reference symbol itself (asterisks, daggers,
# digits/superscript digits, or superscript letters).
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"

285 

# Certain tags are moved from headers in tables into word tags, as they always
# apply to the whole word.
# This was originally created for an issue with number paradigms in Arabic,
# but that is being handled elsewhere now, so the set is currently empty.
# (Idiom fix: was `set([])`.)
TAGS_FORCED_WORDTAGS: set[str] = set()

294 

295 

class InflCell:
    """Cell in an inflection table.

    Holds the cleaned cell text plus its table geometry (colspan/rowspan),
    whether it is a header cell, and an optional link target."""

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        self.text = text.strip()
        # Bug fix: the previous `text and is_title` stored the empty *string*
        # (not False) for empty cells; coerce to a genuine bool.  An empty
        # cell is never treated as a title.
        self.is_title = bool(text) and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        """Compact debug representation: text/is_title/colspan/rowspan."""
        v = "{}/{}/{}/{!r}".format(
            self.text, self.is_title, self.colspan, self.rowspan
        )
        if self.target:
            v += ": {!r}".format(self.target)
        return v

    def __repr__(self) -> str:
        return str(self)

336 

337 

class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",  # Starting column index of the span
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: list[tuple[str, ...]],
        text: str,
        all_headers_row: bool,
    ) -> None:
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        # Consistency fix: rowspan was previously never validated even though
        # colspan was; enforce the same invariant.
        assert isinstance(rowspan, int) and rowspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        for x in tagsets:
            assert isinstance(x, tuple)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Normalize each alternative into a sorted, de-duplicated tuple.
        self.tagsets = list(tuple(sorted(set(tags))) for tags in tagsets)
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False

378 

379 

def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character.

    A character counts as superscript when its Unicode name starts with
    "SUPERSCRIPT" or "MODIFIER LETTER SMALL/CAPITAL"."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        name = unicodedata.name(ch)
    except ValueError:
        # Character has no Unicode name (e.g. unassigned/control codepoint).
        return False
    # str.startswith with a tuple of prefixes replaces the earlier anchored
    # regex alternation; behavior is identical but simpler and faster.
    return name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )

396 

397 

def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` in place when together
    they serve no purpose (i.e. they cover all options of a category)."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Opposing pairs that, when both present, may be dropped if the
    # language configuration says the pair is uninformative.
    if "animate" in tags and "inanimate" in tags:
        if get_lang_conf(lang, "animate_inanimate_remove"):
            tags.remove("animate")
            tags.remove("inanimate")
    if "virile" in tags and "nonvirile" in tags:
        if get_lang_conf(lang, "virile_nonvirile_remove"):
            tags.remove("virile")
            tags.remove("nonvirile")
    # For each whole category listed in the language configuration
    # (numbers, genders, ...): if every member of the category is present,
    # the category conveys no information, so drop all of its members.
    for category in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        members = get_lang_conf(lang, category)
        if members and all(member in tags for member in members):
            for member in members:
                tags.remove(member)

448 

449 

def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    return {valid_tags[tag] for alternative in tagset for tag in alternative}

454 

455 

def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # Bug fix: this assertion previously re-checked tagsets1 (copy-paste).
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Insert tags1 into tagsets, merging it with an existing alternative
        # when they differ in at most one tag category.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2 by counting how
            # many categories differ between them.
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            # print("tags1={} tags2={} num_differ={}"
            #       .format(tags1, tags2, num_differ))
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        # Both inputs were effectively empty; return a single empty tagset.
        tagsets.append(())

    # print("or_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, tagsets))
    return tagsets

518 

519 

def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # Bug fix: this assertion previously re-checked tagsets1 (copy-paste).
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    # Cartesian product: union every alternative from tagsets1 with every
    # alternative from tagsets2.
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    # print("and_tagsets: {} + {} -> {}"
    #       .format(tagsets1, tagsets2, new_tagsets))
    return new_tagsets

546 

547 

@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags), where ``cleaned`` is the cell text with
    reference markers stripped, ``refs`` lists reference symbols found on
    the cell, ``defs`` lists (symbol, definition-text) pairs when the cell
    itself defines reference symbols, and ``tags`` are header tags implied
    by special reference markers."""
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags = []
    # Strip trailing comma/bullet and collapse whitespace.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Prose-like cells (explanatory notes) are ignored entirely.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while True:
        # Peel off trailing ^X or ^(...) reference markers one at a time.
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # The cell defines one or more reference symbols; split the text
        # into (symbol, definition) pairs.
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    while col:
        # Strip trailing superscript characters and daggers, interpreting
        # known sequences (ʳᵃʳᵉ, language-specific markers) as tags.
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        # E.g. "1) some note" -> definition for symbol "1".
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    #       .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags

680 

681 

@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is dictionary describing additional forms to be
    included in the part-of-speech entry)."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Strip HTML entities and tags, collapse whitespace.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags = []
    table_tags = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip matches that are just "Inflection of <word>" boilerplate.
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    # E.g. "class 5" -> form "5" tagged "class".
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms

767 

768 

769def expand_header( 

770 wxr: WiktextractContext, 

771 tablecontext: "TableContext", 

772 word: str, 

773 lang: str, 

774 pos: str, 

775 text: str, 

776 base_tags: Union[list[str], set[str], tuple[str, ...]], 

777 silent=False, 

778 ignore_tags=False, 

779 depth=0, 

780) -> list[tuple[str, ...]]: 

781 """Expands a cell header to tagset, handling conditional expressions 

782 in infl_map. This returns list of tuples of tags, each list element 

783 describing an alternative interpretation. ``base_tags`` is combined 

784 column and row tags for the cell in which the text is being interpreted 

785 (conditional expressions in inflection data may depend on it). 

786 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags`` 

787 is True, then tags listed in "if" will be ignored in the test (this is 

788 used when trying to heuristically detect whether a non-<th> cell is anyway 

789 a header).""" 

790 assert isinstance(wxr, WiktextractContext) 

791 assert isinstance(word, str) 

792 assert isinstance(lang, str) 

793 assert isinstance(pos, str) 

794 assert isinstance(text, str) 

795 assert isinstance(base_tags, (list, tuple, set)) 

796 assert silent in (True, False) 

797 assert isinstance(depth, int) 

798 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags)) 

799 # First map the text using the inflection map 

800 text = clean_value(wxr, text) 

801 combined_return: list[tuple[str, ...]] = [] 

802 parts = split_at_comma_semi(text, separators=[";"]) 

803 for text in parts: 

804 if not text: 804 ↛ 805line 804 didn't jump to line 805 because the condition on line 804 was never true

805 continue 

806 if text in infl_map: 

807 v = infl_map[text] # list or string 

808 else: 

809 m = re.match(infl_start_re, text) 

810 if m is not None: 810 ↛ 811line 810 didn't jump to line 811 because the condition on line 810 was never true

811 v = infl_start_map[m.group(1)] 

812 # print("INFL_START {} -> {}".format(text, v)) 

813 elif re.match(r"Notes", text): 

814 # Ignored header 

815 # print("IGNORING NOTES") 

816 combined_return = or_tagsets( 

817 lang, pos, combined_return, [("dummy-skip-this",)] 

818 ) 

819 # this just adds dummy-skip-this 

820 continue 

821 elif text in IGNORED_COLVALUES: 

822 combined_return = or_tagsets( 

823 lang, pos, combined_return, [("dummy-ignore-skipped",)] 

824 ) 

825 continue 

826 # Try without final parenthesized part 

827 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text) 

828 if text_without_parens in infl_map: 

829 v = infl_map[text_without_parens] 

830 elif m is None: 830 ↛ 846line 830 didn't jump to line 846 because the condition on line 830 was always true

831 if not silent: 

832 wxr.wtp.debug( 

833 "inflection table: unrecognized header: {}".format( 

834 repr(text) 

835 ), 

836 sortid="inflection/735", 

837 ) 

838 # Unrecognized header 

839 combined_return = or_tagsets( 

840 lang, pos, combined_return, [("error-unrecognized-form",)] 

841 ) 

842 continue 

843 

844 # Then loop interpreting the value, until the value is a simple string. 

845 # This may evaluate nested conditional expressions. 

846 default_then = None 

847 while True: 

848 # If it is a string, we are done. 

849 if isinstance(v, str): 

850 tags = set(v.split()) 

851 remove_useless_tags(lang, pos, tags) 

852 tagset = [tuple(sorted(tags))] 

853 break 

854 # For a list, just interpret it as alternatives. (Currently the 

855 # alternatives must directly be strings.) 

856 if isinstance(v, (list, tuple)): 

857 tagset = [] 

858 for x in v: 

859 tags = set(x.split()) 

860 remove_useless_tags(lang, pos, tags) 

861 tags_t = tuple(sorted(tags)) 

862 if tags_t not in tagset: 862 ↛ 858line 862 didn't jump to line 858 because the condition on line 862 was always true

863 tagset.append(tags_t) 

864 break 

865 # Otherwise the value should be a dictionary describing a 

866 # conditional expression. 

867 if not isinstance(v, dict): 867 ↛ 868line 867 didn't jump to line 868 because the condition on line 867 was never true

868 wxr.wtp.debug( 

869 "inflection table: internal: " 

870 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]), 

871 sortid="inflection/767", 

872 ) 

873 tagset = [()] 

874 break 

875 # Evaluate the conditional expression. 

876 assert isinstance(v, dict) 

877 cond: Union[bool, str] = "default-true" 

878 c: Union[str, list[str], set[str]] = "" 

879 # Handle "lang" condition. The value must be either a 

880 # single language or a list of languages, and the 

881 # condition evaluates to True if the table is one of 

882 # those languages. 

883 if "lang" in v: 

884 c = v["lang"] 

885 if isinstance(c, str): 

886 cond = c == lang 

887 else: 

888 assert isinstance(c, (list, tuple, set)) 

889 cond = lang in c 

890 # Handle "nested-table-depth" condition. The value must 

891 # be an int or list of ints, and the condition evaluates 

892 # True if the depth is one of those values. 

893 # "depth" is how deep into a nested table tree the current 

894 # table lies. It is first started in handle_wikitext_table, 

895 # so only applies to tables-within-tables, not other 

896 # WikiNode content. `depth` is currently only passed as a 

897 # parameter down the table parsing stack, and not stored. 

898 if cond and "nested-table-depth" in v: 898 ↛ 899line 898 didn't jump to line 899 because the condition on line 898 was never true

899 d = v["nested-table-depth"] 

900 if isinstance(d, int): 

901 cond = d == depth 

902 else: 

903 assert isinstance(d, (list, tuple, set)) 

904 cond = depth in d 

905 # Handle inflection-template condition. Must be a string 

906 # or list of strings, and if tablecontext.template_name is in 

907 # those, accept the condition. 

908 # TableContext.template_name is passed down from page/ 

909 # parse_inflection, before parsing and expanding itself 

910 # has begun. 

911 if cond and tablecontext and "inflection-template" in v: 

912 d1 = v["inflection-template"] 

913 if isinstance(d1, str): 913 ↛ 916line 913 didn't jump to line 916 because the condition on line 913 was always true

914 cond = d1 == tablecontext.template_name 

915 else: 

916 assert isinstance(d1, (list, tuple, set)) 

917 cond = tablecontext.template_name in d1 

918 # Handle "pos" condition. The value must be either a single 

919 # part-of-speech or a list of them, and the condition evaluates to 

920 # True if the part-of-speech is any of those listed. 

921 if cond and "pos" in v: 

922 c = v["pos"] 

923 if isinstance(c, str): 

924 cond = c == pos 

925 else: 

926 assert isinstance(c, (list, tuple, set)) 

927 cond = pos in c 

928 # Handle "if" condition. The value must be a string containing a 

929 # space-separated list of tags. The condition evaluates to True if 

930 # ``base_tags`` contains all of the listed tags. If the condition 

931 # is of the form "any: ...tags...", then any of the tags will be 

932 # enough. 

933 if cond and "if" in v and not ignore_tags: 

934 c = v["if"] 

935 assert isinstance(c, str) 

936 # "if" condition is true if any of the listed tags is present if 

937 # it starts with "any:", otherwise all must be present 

938 if c.startswith("any: "): 

939 cond = any(t in base_tags for t in c[5:].split()) 

940 else: 

941 cond = all(t in base_tags for t in c.split()) 

942 

943 # Handle "default" assignment. Store the value to be used 

944 # as a default later. 

945 if "default" in v: 

946 assert isinstance(v["default"], str) 

947 default_then = v["default"] 

948 

949 # Warning message about missing conditions for debugging. 

950 

951 if cond == "default-true" and not default_then and not silent: 

952 wxr.wtp.debug( 

953 "inflection table: IF MISSING COND: word={} " 

954 "lang={} text={} base_tags={} c={} cond={}".format( 

955 word, lang, text, base_tags, c, cond 

956 ), 

957 sortid="inflection/851", 

958 ) 

959 # Based on the result of evaluating the condition, select either 

960 # "then" part or "else" part. 

961 if cond: 

962 v = v.get("then", "") 

963 else: 

964 v1 = v.get("else") 

965 if v1 is None: 

966 if default_then: 

967 v = default_then 

968 else: 

969 if not silent: 

970 wxr.wtp.debug( 

971 "inflection table: IF WITHOUT ELSE EVALS " 

972 "False: " 

973 "{}/{} {!r} base_tags={}".format( 

974 word, lang, text, base_tags 

975 ), 

976 sortid="inflection/865", 

977 ) 

978 v = "error-unrecognized-form" 

979 else: 

980 v = v1 

981 

982 # Merge the resulting tagset from this header part with the other 

983 # tagsets from the whole header 

984 combined_return = or_tagsets(lang, pos, combined_return, tagset) 

985 

986 # Return the combined tagsets, or empty tagset if we got no tagsets 

987 if not combined_return: 

988 combined_return = [()] 

989 return combined_return 

990 

991 

def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list[HdrSpan],
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Iterates ``hdrspans`` in reverse (headers closest to the cell first),
    keeping only spans that horizontally overlap the column range
    ``[start, start + colspan)``.  Tagsets from headers on the same row are
    combined with ``and_tagsets``; when moving to a higher row the
    accumulated row tagsets are merged into ``coltags``.  A battery of
    language-configurable heuristics (``get_lang_conf``) decides when to
    stop or skip so that categories (tense, mood, number, ...) from
    unrelated header rows do not bleed into the cell's tags.

    Args:
        lang: language name of the table.
        pos: part-of-speech of the word being processed.
        hdrspans: header spans collected so far for this table.
        start: first column index covered by the cell.
        colspan: number of columns the cell covers (>= 1).
        celltext: cell text; used only for targeted debug printing
            (compared against the module-global ``debug_cell_text``).

    Returns:
        A non-empty list of tag tuples (alternative tagsets) for the cell.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    #       .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                " row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    # (start, colspan) keys of header cells already consumed; used to block
    # re-reading the same column position from a higher row.
    used = set()
    coltags = [()]
    last_header_row = 1000000  # sentinel: no header row accepted yet
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000  # sentinel: not on any row yet
    used_hdrspans = set()  # id()s of hdrspan objects already merged
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets. This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row. Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside

                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above. However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 10000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags

1372 

1373 

1374def parse_simple_table( 

1375 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth 

1376): 

1377 """This is the default table parser. Despite its name, it can parse 

1378 complex tables. This returns a list of forms to be added to the 

1379 part-of-speech, or None if the table could not be parsed.""" 

1380 assert isinstance(wxr, WiktextractContext) 

1381 assert isinstance(tablecontext, TableContext) 

1382 assert isinstance(word, str) 

1383 assert isinstance(lang, str) 

1384 assert isinstance(pos, str) 

1385 assert isinstance(rows, list) 

1386 assert isinstance(source, str) 

1387 assert isinstance(after, str) 

1388 assert isinstance(depth, int) 

1389 for row in rows: 

1390 for col in row: 

1391 assert isinstance(col, InflCell) 

1392 assert isinstance(titles, list) 

1393 for x in titles: 

1394 assert isinstance(x, str) 

1395 

1396 # print("PARSE_SIMPLE_TABLE: TITLES:", titles) 

1397 if debug_cell_text: 1397 ↛ 1398line 1397 didn't jump to line 1398 because the condition on line 1397 was never true

1398 print("ROWS:") 

1399 for row in rows: 

1400 print(" ", row) 

1401 

1402 # Check for forced rowspan kludge. See e.g. 

1403 # maorski/Serbo-Croatian. These are essentially multi-row 

1404 # cells implemented using <br> rather than separate cell. We fix this 

1405 # by identifying rows where this happens, and splitting the current row 

1406 # to multiple rows by synthesizing additional cells. 

1407 new_rows = [] 

1408 for row in rows: 

1409 split_row = ( 

1410 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row) 

1411 and 

1412 # x is an InflCell 

1413 all(x.rowspan == 1 for x in row) 

1414 ) 

1415 if not split_row: 

1416 new_rows.append(row) 

1417 continue 

1418 row1 = [] 

1419 row2 = [] 

1420 for cell in row: 

1421 cell1 = copy.deepcopy(cell) 

1422 if "\n" in cell.text: 

1423 # Has more than one line - split this cell 

1424 parts = cell.text.strip().splitlines() 

1425 if len(parts) != 2: 1425 ↛ 1426line 1425 didn't jump to line 1426 because the condition on line 1425 was never true

1426 wxr.wtp.debug( 

1427 "forced rowspan kludge got {} parts: {!r}".format( 

1428 len(parts), cell.text 

1429 ), 

1430 sortid="inflection/1234", 

1431 ) 

1432 cell2 = copy.deepcopy(cell) 

1433 cell1.text = parts[0] 

1434 cell2.text = parts[1] 

1435 else: 

1436 cell1.rowspan = 2 

1437 cell2 = cell1 # ref, not a copy 

1438 row1.append(cell1) 

1439 row2.append(cell2) 

1440 new_rows.append(row1) 

1441 new_rows.append(row2) 

1442 rows = new_rows 

1443 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:") 

1444 # for row in rows: 

1445 # print(" ", row) 

1446 

1447 # Parse definitions for references (from table itself and from text 

1448 # after it) 

1449 def_ht = {} 

1450 

1451 def add_defs(defs: list[tuple[str, str]]) -> None: 

1452 for ref, d in defs: 

1453 # print("DEF: ref={} d={}".format(ref, d)) 

1454 d = d.strip() 

1455 d = d.split(". ")[0].strip() # text before ". " 

1456 if not d: 1456 ↛ 1457line 1456 didn't jump to line 1457 because the condition on line 1456 was never true

1457 continue 

1458 if d.endswith("."): # catc ".."?? 

1459 d = d[:-1] 

1460 tags, topics = decode_tags(d, no_unknown_starts=True) 

1461 # print(f"{ref=}, {d=}, {tags=}") 

1462 if topics or any("error-unknown-tag" in ts for ts in tags): 

1463 d = d[0].lower() + d[1:] 

1464 tags, topics = decode_tags(d, no_unknown_starts=True) 

1465 if topics or any("error-unknown-tag" in ts for ts in tags): 

1466 # Failed to parse as tags 

1467 # print("Failed: topics={} tags={}" 

1468 # .format(topics, tags)) 

1469 continue 

1470 tags1_s: set[str] = set() 

1471 for ts in tags: 

1472 tags1_s.update(ts) 

1473 tags1 = tuple(sorted(tags1_s)) 

1474 # print("DEFINED: {} -> {}".format(ref, tags1)) 

1475 def_ht[ref] = tags1 

1476 

    def generate_tags(
        rowtags: list[tuple[str, ...]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        """Expand the current header cell into row/column/header tagsets.

        Closure over ``parse_simple_table`` state (``lang``, ``pos``,
        ``hdrspans``, ``col_idx``, ``colspan``, ``col``, ``text``,
        ``global_tags``, ``refs_tags``, ``hdr_tags``, ``depth``, ...).

        For every existing row tagset and every column tagset computed for
        this cell position, the header text is expanded via
        ``expand_header`` against the combined base tags.  NOTE: mutates
        ``table_tags`` in place when forced word-level tags are
        encountered.

        Returns:
            (new_rowtags, new_coltags, all_hdr_tags) — updated row
            tagsets, new column tagsets, and all header tag alternatives.
        """
        new_coltags = []
        all_hdr_tags = []  # list of tuples
        new_rowtags = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                # Union of row tags, column tags, table-global tags and
                # accumulated word-level table tags.
                base_tags: set[str] = (
                    set(rt0)
                    | set(ct0)
                    | set(global_tags)
                    | set(itertools.chain.from_iterable(table_tags))
                )  # Union.
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Certain tags are always moved to word-level tags
                    if tt_s & TAGS_FORCED_WORDTAGS:
                        table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS)
                        tt_s = tt_s - TAGS_FORCED_WORDTAGS
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags

1537 

    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        """Create a HdrSpan for the current header cell and update the
        left-column (col0) expansion state.

        Closure over ``parse_simple_table`` state (``col_idx``,
        ``colspan``, ``rowspan``, ``rownum``, ``new_coltags``,
        ``all_headers``, ``previously_seen``, ``all_hdr_tags``, ``lang``,
        ``tablecontext``).  Appends the new span to ``hdrspans`` and may
        widen (``expanded = True``) the current leftmost header span to
        cover following columns when language configuration allows.

        Returns:
            (col, col0_followed_by_nonempty, col0_hdrspan) — possibly
            updated left-column tracking state for the caller.
        """
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags). In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            # No left-column span tracked yet; this becomes it.
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # print("col0_cats={} later_cats={} "
            #       "fol_by_nonempty={} col_idx={} end={} "
            #       "tagsets={}"
            #       .format(col0_cats, later_cats,
            #               col0_followed_by_nonempty, col_idx,
            #               col0_hdrspan.start +
            #               col0_hdrspan.colspan,
            #               col0_hdrspan.tagsets))
            # print("col0.rowspan={} rowspan={}"
            #       .format(col0_hdrspan.rowspan, rowspan))
            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            #   - col0_hdrspan set, continue with allowed current
            #   - col0_hdrspan set, expand, start new
            #   - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan. Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan
                # *Expand* current col0_hdrspan
                # print("EXPANDING COL0 MID: {} from {} to {} "
                #       "cols {}"
                #       .format(col0_hdrspan.text,
                #               col0_hdrspan.colspan,
                #               col_idx - col0_hdrspan.start,
                #               col0_hdrspan.tagsets))
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan

1635 

1636 def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]: 

1637 # Split the cell text into alternatives 

1638 split_extra_tags = [] 

1639 if col and is_superscript(col[0]): 1639 ↛ 1640line 1639 didn't jump to line 1640 because the condition on line 1639 was never true

1640 alts = [col] 

1641 else: 

1642 separators = [";", "•", r"\n", " or "] 

1643 if " + " not in col: 

1644 separators.append(",") 

1645 if not col.endswith("/"): 

1646 separators.append("/") 

1647 if col in special_phrase_splits: 

1648 # Use language-specific special splits. 

1649 # These are phrases and constructions that have 

1650 # unique ways of splitting, not specific characters 

1651 # to split on like with the default splitting. 

1652 alts, tags = special_phrase_splits[col] 

1653 split_extra_tags = tags.split() 

1654 for x in split_extra_tags: 

1655 assert x in valid_tags 

1656 assert isinstance(alts, (list, tuple)) 

1657 assert isinstance(tags, str) 

1658 else: 

1659 # Use default splitting. However, recognize 

1660 # language-specific replacements and change them to magic 

1661 # characters before splitting. This way we won't split 

1662 # them. This is important for, e.g., recognizing 

1663 # alternative pronouns. 

1664 # The magic characters are characters out of Unicode scope 

1665 # that are given a simple incremental value, int > unicode. 

1666 repls = {} 

1667 magic_ch = MAGIC_FIRST 

1668 trs = get_lang_conf(lang, "form_transformations") 

1669 # trs is a list of lists of strings 

1670 for _, v, _, _ in trs: 

1671 # v is a pattern string, like "^ich" 

1672 # form_transformations data is doing double-duty here, 

1673 # because the pattern strings are already known to us and 

1674 # not meant to be split. 

1675 m = re.search(v, col) 

1676 if m is not None: 

1677 # if pattern found in text 

1678 magic = chr(magic_ch) 

1679 magic_ch += 1 # next magic character value 

1680 col = re.sub(v, magic, col) # replace with magic ch 

1681 repls[magic] = m.group(0) 

1682 # remember what regex match string each magic char 

1683 # replaces. .group(0) is the whole match. 

1684 alts0 = split_at_comma_semi(col, separators=separators) 

1685 # with magic characters in place, split the text so that 

1686 # pre-transformation text is out of the way. 

1687 alts = [] 

1688 for alt in alts0: 

1689 # create a new list with the separated items and 

1690 # the magic characters replaced with the original texts. 

1691 for k, v in repls.items(): 

1692 alt = re.sub(k, v, alt) 

1693 alts.append(alt) 

1694 

1695 # Remove "*" from beginning of forms, as in non-attested 

1696 # or reconstructed forms. Otherwise it might confuse romanization 

1697 # detection. 

1698 alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts) 

1699 alts = list( 

1700 x for x in alts if not re.match(r"pronounced with |\(with ", x) 

1701 ) 

1702 alts = list( 

1703 re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts 

1704 ) 

1705 # Check for parenthesized alternatives, e.g. ripromettersi/Italian 

1706 if all( 1706 ↛ 1717line 1706 didn't jump to line 1717 because the condition on line 1706 was never true

1707 re.match(r"\w+( \w+)* \(\w+( \w+)*(, \w+( \w+)*)*\)$", alt) 

1708 # word word* \(word word*(, word word*)*\) 

1709 and all( 

1710 distw([re.sub(r" \(.*", "", alt)], x) < 0.5 

1711 # Levenshtein distance 

1712 for x in re.sub(r".*\((.*)\)", r"\1", alt).split(", ") 

1713 ) 

1714 # Extract from parentheses for testin 

1715 for alt in alts 

1716 ): 

1717 new_alts = [] 

1718 for alt in alts: 

1719 # Replace parentheses before splitting 

1720 alt = alt.replace(" (", ", ") 

1721 alt = alt.replace(")", "") 

1722 for new_alt in alt.split(", "): 

1723 new_alts.append(new_alt) 

1724 alts = new_alts 

1725 return col, alts, split_extra_tags 

1726 

    def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
        """Pair up forms with their romanizations or IPA in mixed cells.

        Handles the special case where romanization is given under the
        normal form, e.g. in Russian.  There can be multiple
        comma-separated forms in each case.  We also handle the case
        where instead of romanization we have IPA pronunciation
        (e.g., avoir/French/verb).

        Returns a list of (form, romanization, ipa) triples; empty
        strings mark a missing romanization/IPA component.
        """
        len2 = len(alts) // 2
        # Check for IPAs (forms first, IPAs under)
        # Layout: base, base, IPA, IPA
        if (
            len(alts) % 2 == 0  # Divisible by two
            and all(
                re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
                for x in alts[len2:]
            )
        ):  # In the second half of alts
            nalts = list(
                (alts[i], "", alts[i + len2])
                # List of tuples: (base, "", ipa)
                for i in range(len2)
            )
        # Layout: base, base, base, IPA (single IPA for all forms)
        elif (
            len(alts) > 2
            and re.match(r"^\s*/.*/\s*$", alts[-1])
            and all(not x.startswith("/") for x in alts[:-1])
        ):
            # Only if the last alt is IPA
            nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
        # Layout: base, IPA, IPA, IPA (single form, several IPAs)
        elif (
            len(alts) > 2
            and not alts[0].startswith("/")
            and all(
                re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
            )
        ):
            # First is base and the rest is IPA alternatives
            nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

        # Check for romanizations, forms first, romanizations under
        elif (
            len(alts) % 2 == 0
            and not any("(" in x for x in alts)
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        # Remove ends of strings starting from ^.
                        # Superscripts have already been removed
                        # from the string, while ^xyz needs to be
                        # removed separately, though it's usually
                        # something with a single letter?
                        "".join(xx for xx in x if not is_superscript(xx)),
                    )
                )
                == "other"
                for x in alts[:len2]
            )
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in x if not is_superscript(xx)),
                    )
                )
                in ("romanization", "english")
                for x in alts[len2:]
            )
        ):
            nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
        # Check for romanizations, forms and romanizations alternating
        elif (
            len(alts) % 2 == 0
            and not any("(" in x for x in alts)
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in alts[i] if not is_superscript(xx)),
                    )
                )
                == "other"
                for i in range(0, len(alts), 2)
            )
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in alts[i] if not is_superscript(xx)),
                    )
                )
                in ("romanization", "english")
                for i in range(1, len(alts), 2)
            )
        ):
            # even indices are forms, odd indices their romanizations
            nalts = list(
                (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
            )
        else:
            # No recognizable form/romanization/IPA layout: expand
            # parenthesized optional segments like "kind(er)" or
            # "lampai(tten/den)" into explicit alternatives instead.
            new_alts = []
            for alt in alts:
                lst = [""]
                idx = 0
                for m in re.finditer(
                    r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                    # start OR letter OR asterisk (word/word*)
                    # \\___________group 1_______/  \      \_g3_///
                    #  \                             \__gr. 2_//
                    #   \_____________group 0________________/
                    alt,
                ):
                    v = m.group(2)  # (word/word/word...)
                    if (
                        classify_desc(v) == "tags"  # Tags inside parens
                        or m.group(0) == alt
                    ):  # All in parens
                        continue
                    new_lst = []
                    for x in lst:
                        x += alt[idx : m.start()] + m.group(1)
                        # alt until letter or asterisk
                        idx = m.end()
                        vparts = v.split("/")
                        # group(2) = ["word", "wörd"...]
                        if len(vparts) == 1:
                            new_lst.append(x)
                            new_lst.append(x + v)
                            # "kind(er)" -> ["kind", "kinder"]
                        else:
                            for vv in vparts:
                                new_lst.append(x + vv)
                                # "lampai(tten/den)" ->
                                # ["lampaitten", "lampaiden"]
                    lst = new_lst
                for x in lst:
                    new_alts.append(x + alt[idx:])
                    # add the end of alt
            nalts = list((x, "", "") for x in new_alts)
            # [form, no romanization, no ipa]
        return nalts

1874 

1875 def find_semantic_parens(form: str) -> tuple[str, list[str]]: 

1876 # "Some languages" (=Greek) use brackets to mark things that 

1877 # require tags, like (informality), [rarity] and {archaicity}. 

1878 extra_tags = [] 

1879 if re.match(r"\([^][(){}]*\)$", form): 

1880 if get_lang_conf(lang, "parentheses_for_informal"): 

1881 form = form[1:-1] 

1882 extra_tags.append("informal") 

1883 else: 

1884 form = form[1:-1] 

1885 elif re.match(r"\{\[[^][(){}]*\]\}$", form): 

1886 if get_lang_conf( 1886 ↛ 1893line 1886 didn't jump to line 1893 because the condition on line 1886 was always true

1887 lang, "square_brackets_for_rare" 

1888 ) and get_lang_conf(lang, "curly_brackets_for_archaic"): 

1889 # είμαι/Greek/Verb 

1890 form = form[2:-2] 

1891 extra_tags.extend(["rare", "archaic"]) 

1892 else: 

1893 form = form[2:-2] 

1894 elif re.match(r"\{[^][(){}]*\}$", form): 

1895 if get_lang_conf(lang, "curly_brackets_for_archaic"): 1895 ↛ 1900line 1895 didn't jump to line 1900 because the condition on line 1895 was always true

1896 # είμαι/Greek/Verb 

1897 form = form[1:-1] 

1898 extra_tags.extend(["archaic"]) 

1899 else: 

1900 form = form[1:-1] 

1901 elif re.match(r"\[[^][(){}]*\]$", form): 

1902 if get_lang_conf(lang, "square_brackets_for_rare"): 1902 ↛ 1907line 1902 didn't jump to line 1907 because the condition on line 1902 was always true

1903 # είμαι/Greek/Verb 

1904 form = form[1:-1] 

1905 extra_tags.append("rare") 

1906 else: 

1907 form = form[1:-1] 

1908 return form, extra_tags 

1909 

    def handle_parens(
        form: str, roman: str, clitic: str, extra_tags: list[str]
    ) -> tuple[str, str, str]:
        """Interpret one parenthesized chunk found in a form cell.

        Relies on free variables from the enclosing scope: ``paren``
        (the text inside the parentheses), ``m`` (the regex match
        locating the parenthesized chunk within ``form``) and ``subst``
        (the text to splice in where the chunk is removed).  May append
        tags to ``extra_tags`` (mutated in place).  Returns the
        possibly-updated (form, roman, clitic) triple.
        """
        if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
            # is there a clitic starting with apostrophe?
            clitic = paren
            # assume the whole paren is a clitic,
            # then remove paren from form
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        elif classify_desc(paren) == "tags":
            tagsets1, topics1 = decode_tags(paren)
            if not topics1:
                for ts in tagsets1:
                    ts = tuple(x for x in ts if " " not in x)
                    # There are some generated tags containing
                    # spaces; do not let them through here.
                    extra_tags.extend(ts)
                form = (form[: m.start()] + subst + form[m.end() :]).strip()
        # brackets contain romanization
        elif (
            m.start() > 0
            and not roman
            and classify_desc(form[: m.start()]) == "other"
            and
            # "other" ~ text
            classify_desc(paren) in ("romanization", "english")
            and not re.search(r"^with |-form$", paren)
        ):
            roman = paren
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        elif re.search(r"^with |-form", paren):
            # usage notes like "with X" / "...-form": drop them from
            # the form without recording anything
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        return form, roman, clitic

1943 

    def merge_row_and_column_tags(form, some_has_covered_text):
        """Combine row and column header tags into form entries.

        Merge column tags and row tags.  We give preference to moods
        etc. coming from rowtags (cf. austeigen/German/Verb imperative
        forms).  In certain cases, what a tag means depends on whether
        it is a row or column header: depending on the language, we
        replace certain tags with others if they're in a column or row.

        Uses many free variables from the enclosing scope (``rowtags``,
        ``coltags``, ``global_tags``, ``extra_tags``, ``refs_tags``,
        ``tablecontext``, ``col_idx``, ``has_covering_hdr``, ``roman``,
        ``ipa``, ``clitic``, ``source``, ``lang``, ``pos``, ``wxr``).

        Returns (ret, form, some_has_covered_text) where ret is a list
        of form dicts ready to be appended to the results.
        """
        ret = []
        # rtagreplacs = get_lang_conf(lang, "rowtag_replacements")
        # ctagreplacs = get_lang_conf(lang, "coltag_replacements")
        for rt in sorted(rowtags):
            if "dummy-use-as-coltags" in rt:
                continue
            # if lang was in rowtag_replacements)
            # if not rtagreplacs == None:
            #     rt = replace_directional_tags(rt, rtagreplacs)
            for ct in sorted(coltags):
                if "dummy-use-as-rowtags" in ct:
                    continue
                # if lang was in coltag_replacements
                # if not ctagreplacs == None:
                #     ct = replace_directional_tags(ct,
                #                                   ctagreplacs)
                tags = set(global_tags)
                tags.update(extra_tags)
                tags.update(rt)
                tags.update(refs_tags)
                tags.update(tablecontext.section_header)
                # Merge tags from column.  For certain kinds of tags,
                # those coming from the row take precedence.
                old_tags = set(tags)
                for t in ct:
                    c = valid_tags[t]
                    if c in ("mood", "case", "number") and any(
                        valid_tags[tt] == c for tt in old_tags
                    ):
                        continue
                    tags.add(t)

                # Extract language-specific tags from the
                # form.  This may also adjust the form.
                form, lang_tags = lang_specific_tags(lang, pos, form)
                tags.update(lang_tags)

                # For non-finite verb forms, see if they have
                # a gender/class suffix
                if pos == "verb" and any(
                    valid_tags[t] == "non-finite" for t in tags
                ):
                    form, tt = parse_head_final_tags(wxr, lang, form)
                    tags.update(tt)

                # Remove "personal" tag if have nth person; these
                # come up with e.g. reconhecer/Portuguese/Verb.  But
                # not if we also have "pronoun"
                if (
                    "personal" in tags
                    and "pronoun" not in tags
                    and any(
                        x in tags
                        for x in [
                            "first-person",
                            "second-person",
                            "third-person",
                        ]
                    )
                ):
                    tags.remove("personal")

                # If we have impersonal, remove person and number.
                # This happens with e.g. viajar/Portuguese/Verb
                if "impersonal" in tags:
                    tags = tags - set(
                        [
                            "first-person",
                            "second-person",
                            "third-person",
                            "singular",
                            "plural",
                        ]
                    )

                # Remove unnecessary "positive" tag from verb forms
                if pos == "verb" and "positive" in tags:
                    if "negative" in tags:
                        tags.remove("negative")
                    tags.remove("positive")

                # Many Russian (and other Slavic) inflection tables
                # have animate/inanimate distinction that generates
                # separate entries for neuter/feminine, but the
                # distinction only applies to masculine.  Remove them
                # from neuter/feminine and eliminate duplicates.
                if get_lang_conf(lang, "masc_only_animate"):
                    for t1 in ("animate", "inanimate"):
                        for t2 in ("neuter", "feminine"):
                            if (
                                t1 in tags
                                and t2 in tags
                                and "masculine" not in tags
                                and "plural" not in tags
                            ):
                                tags.remove(t1)

                # German adjective tables contain "(keiner)" etc
                # for mixed declension plural.  When the adjective
                # disappears and it becomes just one word, remove
                # the "includes-article" tag.  e.g. eiskalt/German
                if "includes-article" in tags and " " not in form:
                    tags.remove("includes-article")

                # Handle ignored forms.  We mark that the form was
                # provided.  This is important information; some words
                # just do not have a certain form.  However, there are
                # also many cases where no word in a language has a
                # particular form.  Post-processing could detect and
                # remove such cases.
                if form in IGNORED_COLVALUES:
                    # if cell text seems to be ignorable
                    if "dummy-ignore-skipped" in tags:
                        continue
                    if (
                        col_idx not in has_covering_hdr
                        and some_has_covered_text
                    ):
                        continue
                    # don't ignore this cell if there's been a header
                    # above it
                    form = "-"
                elif col_idx in has_covering_hdr:
                    some_has_covered_text = True

                # Handle ambiguous object concord. If a header
                # gives the "dummy-object-concord"-tag to a word,
                # replace person, number and gender tags with
                # their "object-" counterparts so that the verb
                # agrees with the object instead.
                # Use only when the verb has ONLY object agreement!
                # a پخول/Pashto
                if "dummy-object-concord" in tags:
                    for subtag, objtag in object_concord_replacements.items():
                        if subtag in tags:
                            tags.remove(subtag)
                            tags.add(objtag)

                # Remove the dummy mood tag that we sometimes
                # use to block adding other mood and related
                # tags
                tags = tags - set(
                    [
                        "dummy-mood",
                        "dummy-tense",
                        "dummy-ignore-skipped",
                        "dummy-object-concord",
                        "dummy-reset-headers",
                        "dummy-use-as-coltags",
                        "dummy-use-as-rowtags",
                        "dummy-store-hdrspan",
                        "dummy-load-stored-hdrspans",
                        "dummy-reset-stored-hdrspans",
                        "dummy-section-header",
                    ]
                )

                # Perform language-specific tag replacements according
                # to rules in a table.
                lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
                if lang_tag_mappings is not None:
                    for pre, post in lang_tag_mappings.items():
                        if all(t in tags for t in pre):
                            tags = (tags - set(pre)) | set(post)

                # Warn if there are entries with empty tags
                if not tags:
                    wxr.wtp.debug(
                        "inflection table: empty tags for {}".format(form),
                        sortid="inflection/1826",
                    )

                # Warn if form looks like IPA
                ########## XXX ########
                # Because IPA is its own unicode block, we could also
                # technically do a Unicode name check to see if a string
                # contains IPA.  Not all valid IPA characters are in the
                # IPA extension block, so you can technically have false
                # negatives if it's something like /toki/, but it
                # shouldn't give false positives.
                # Alternatively, you could make a list of IPA-admissible
                # characters and reject non-IPA stuff with that.
                if re.match(r"\s*/.*/\s*$", form):
                    wxr.wtp.debug(
                        "inflection table form looks like IPA: "
                        "form={} tags={}".format(form, tags),
                        sortid="inflection/1840",
                    )

                # Note that this checks `form`, not `in tags`
                if form == "dummy-ignored-text-cell":
                    continue

                if "dummy-remove-this-cell" in tags:
                    continue

                # Add the form
                tags = list(sorted(tags))
                dt = {"form": form, "tags": tags, "source": source}
                if roman:
                    dt["roman"] = roman
                if ipa:
                    dt["ipa"] = ipa
                ret.append(dt)
                # If we got a separate clitic form, add it too
                if clitic:
                    dt = {
                        "form": clitic,
                        "tags": tags + ["clitic"],
                        "source": source,
                    }
                    ret.append(dt)
        return ret, form, some_has_covered_text

2167 

2168 # First extract definitions from cells 

2169 # See defs_ht for footnote defs stuff 

2170 for row in rows: 

2171 for cell in row: 

2172 text, refs, defs, hdr_tags = extract_cell_content( 

2173 lang, word, cell.text 

2174 ) 

2175 # refs, defs = footnote stuff, defs -> (ref, def) 

2176 add_defs(defs) 

2177 # Extract definitions from text after table 

2178 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after) 

2179 add_defs(defs) 

2180 

2181 # Then extract the actual forms 

2182 ret = [] 

2183 hdrspans = [] 

2184 first_col_has_text = False 

2185 rownum = 0 

2186 title = None 

2187 global_tags = [] 

2188 table_tags = [] 

2189 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits") 

2190 form_replacements = get_lang_conf(lang, "form_replacements") 

2191 form_transformations = get_lang_conf(lang, "form_transformations") 

2192 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells") 

2193 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups") 

2194 

2195 for title in titles: 

2196 more_global_tags, more_table_tags, extra_forms = parse_title( 

2197 title, source 

2198 ) 

2199 global_tags.extend(more_global_tags) 

2200 table_tags.extend(more_table_tags) 

2201 ret.extend(extra_forms) 

2202 cell_rowcnt = collections.defaultdict(int) 

2203 seen_cells = set() 

2204 has_covering_hdr = set() 

2205 some_has_covered_text = False 

2206 for row in rows: 

2207 # print("ROW:", row) 

2208 # print("====") 

2209 # print(f"Start of PREVIOUS row hdrspans:" 

2210 # f"{tuple(sp.tagsets for sp in hdrspans)}") 

2211 # print(f"Start of row txt: {tuple(t.text for t in row)}") 

2212 if not row: 2212 ↛ 2213line 2212 didn't jump to line 2213 because the condition on line 2212 was never true

2213 continue # Skip empty rows 

2214 all_headers = all(x.is_title or not x.text.strip() for x in row) 

2215 text = row[0].text 

2216 if ( 

2217 row[0].is_title 

2218 and text 

2219 and not is_superscript(text[0]) 

2220 and text not in infl_map # zealous inflation map? 

2221 and ( 

2222 re.match(r"Inflection ", text) 

2223 or re.sub( 

2224 r"\s+", 

2225 " ", # flatten whitespace 

2226 re.sub( 

2227 r"\s*\([^)]*\)", 

2228 "", 

2229 # Remove whitespace+parens 

2230 text, 

2231 ), 

2232 ).strip() 

2233 not in infl_map 

2234 ) 

2235 and not re.match(infl_start_re, text) 

2236 and all( 

2237 x.is_title == row[0].is_title and x.text == text 

2238 # all InflCells in `row` have the same is_title and text 

2239 for x in row 

2240 ) 

2241 ): 

2242 if text and title is None: 

2243 # Only if there were no titles previously make the first 

2244 # text that is found the title 

2245 title = text 

2246 if re.match(r"(Note:|Notes:)", title): 2246 ↛ 2247line 2246 didn't jump to line 2247 because the condition on line 2246 was never true

2247 continue # not a title 

2248 more_global_tags, more_table_tags, extra_forms = parse_title( 

2249 title, source 

2250 ) 

2251 global_tags.extend(more_global_tags) 

2252 table_tags.extend(more_table_tags) 

2253 ret.extend(extra_forms) 

2254 continue # Skip title rows without incrementing i 

2255 if "dummy-skip-this" in global_tags: 2255 ↛ 2256line 2255 didn't jump to line 2256 because the condition on line 2255 was never true

2256 return [] 

2257 rowtags = [()] 

2258 # have_hdr = False 

2259 # have_hdr never used? 

2260 have_text = False 

2261 samecell_cnt = 0 

2262 col0_hdrspan = None # col0 or later header (despite its name) 

2263 col0_followed_by_nonempty = False 

2264 row_empty = True 

2265 for col_idx, cell in enumerate(row): 

2266 colspan = cell.colspan # >= 1 

2267 rowspan = cell.rowspan # >= 1 

2268 previously_seen = id(cell) in seen_cells 

2269 # checks to see if this cell was in the previous ROW 

2270 seen_cells.add(id(cell)) 

2271 if samecell_cnt == 0: 

2272 # First column of a (possible multi-column) cell 

2273 samecell_cnt = colspan - 1 

2274 else: 

2275 assert samecell_cnt > 0 

2276 samecell_cnt -= 1 

2277 continue 

2278 

2279 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0 

2280 # never used? 

2281 

2282 # defaultdict(int) around line 1900 

2283 cell_rowcnt[id(cell)] += 1 

2284 # => how many cols this spans 

2285 col = cell.text 

2286 if not col: 

2287 continue 

2288 row_empty = False 

2289 is_title = cell.is_title 

2290 

2291 # If the cell has a target, i.e., text after colon, interpret 

2292 # it as simply specifying a value for that value and ignore 

2293 # it otherwise. 

2294 if cell.target: 

2295 text, refs, defs, hdr_tags = extract_cell_content( 

2296 lang, word, col 

2297 ) 

2298 if not text: 2298 ↛ 2299line 2298 didn't jump to line 2299 because the condition on line 2298 was never true

2299 continue 

2300 refs_tags = set() 

2301 for ref in refs: # gets tags from footnotes 2301 ↛ 2302line 2301 didn't jump to line 2302 because the loop on line 2301 never started

2302 if ref in def_ht: 

2303 refs_tags.update(def_ht[ref]) 

2304 rowtags = expand_header( 

2305 wxr, 

2306 tablecontext, 

2307 word, 

2308 lang, 

2309 pos, 

2310 text, 

2311 [], 

2312 silent=True, 

2313 depth=depth, 

2314 ) 

2315 rowtags = list( 

2316 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags) 

2317 ) 

2318 is_title = False 

2319 col = cell.target 

2320 

2321 # print(rownum, col_idx, col) 

2322 # print(f"is_title: {is_title}") 

2323 if is_title: 

2324 # It is a header cell 

2325 text, refs, defs, hdr_tags = extract_cell_content( 

2326 lang, word, col 

2327 ) 

2328 if not text: 

2329 continue 

2330 # Extract tags from referenced footnotes 

2331 refs_tags = set() 

2332 for ref in refs: 

2333 if ref in def_ht: 

2334 refs_tags.update(def_ht[ref]) 

2335 

2336 # Expand header to tags 

2337 v = expand_header( 

2338 wxr, 

2339 tablecontext, 

2340 word, 

2341 lang, 

2342 pos, 

2343 text, 

2344 [], 

2345 silent=True, 

2346 depth=depth, 

2347 ) 

2348 # print("EXPANDED {!r} to {}".format(text, v)) 

2349 

2350 if col_idx == 0: 

2351 # first_col_has_text is used for a test to ignore 

2352 # upper-left cells that are just text without 

2353 # header info 

2354 first_col_has_text = True 

2355 # Check if the header expands to reset hdrspans 

2356 if any("dummy-reset-headers" in tt for tt in v): 

2357 new_hdrspans = [] 

2358 for hdrspan in hdrspans: 

2359 # if there are HdrSpan objects (abstract headers with 

2360 # row- and column-spans) that are to the left or at the 

2361 # same row or below, KEEP those; things above and to 

2362 # the right of the hdrspan with dummy-reset-headers 

2363 # are discarded. Tags from the header together with 

2364 # dummy-reset-headers are kept as normal. 

2365 if ( 

2366 hdrspan.start + hdrspan.colspan < col_idx 

2367 or hdrspan.rownum > rownum - cell.rowspan 

2368 ): 

2369 new_hdrspans.append(hdrspan) 

2370 hdrspans = new_hdrspans 

2371 

2372 for tt in v: 

2373 if "dummy-section-header" in tt: 2373 ↛ 2374line 2373 didn't jump to line 2374 because the condition on line 2373 was never true

2374 tablecontext.section_header = tt 

2375 break 

2376 if "dummy-reset-section-header" in tt: 2376 ↛ 2377line 2376 didn't jump to line 2377 because the condition on line 2376 was never true

2377 tablecontext.section_header = [] 

2378 # Text between headers on a row causes earlier headers to 

2379 # be reset 

2380 if have_text: 

2381 # print(" HAVE_TEXT BEFORE HDR:", col) 

2382 # Reset rowtags if new title column after previous 

2383 # text cells 

2384 # +-----+-----+-----+-----+ 

2385 # |hdr-a|txt-a|hdr-B|txt-B| 

2386 # +-----+-----+-----+-----+ 

2387 # ^reset rowtags=> 

2388 # XXX beware of header "—": "" - must not clear on that if 

2389 # it expands to no tags 

2390 rowtags = [()] 

2391 # have_hdr = True 

2392 # have_hdr never used? 

2393 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags)) 

2394 # Update rowtags and coltags 

2395 has_covering_hdr.add(col_idx) # col_idx == current column 

2396 # has_covering_hdr is a set that has the col_idx-ids of columns 

2397 # that have previously had some kind of header. It is never 

2398 # resetted inside the col_idx-loops OR the bigger rows-loop, so 

2399 # applies to the whole table. 

2400 

2401 rowtags, new_coltags, all_hdr_tags = generate_tags( 

2402 rowtags, table_tags 

2403 ) 

2404 

2405 if any("dummy-skip-this" in ts for ts in rowtags): 

2406 continue # Skip this cell 

2407 

2408 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2408 ↛ 2409line 2408 didn't jump to line 2409 because the condition on line 2408 was never true

2409 hdrspans.extend(tablecontext.stored_hdrspans) 

2410 

2411 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2411 ↛ 2412line 2411 didn't jump to line 2412 because the condition on line 2411 was never true

2412 tablecontext.stored_hdrspans = [] 

2413 

2414 if any("dummy-store-hdrspan" in ts for ts in v): 2414 ↛ 2416line 2414 didn't jump to line 2416 because the condition on line 2414 was never true

2415 # print(f"STORED: {col}") 

2416 store_new_hdrspan = True 

2417 else: 

2418 store_new_hdrspan = False 

2419 

2420 new_coltags = list( 

2421 x 

2422 for x in new_coltags 

2423 if not any(t in noinherit_tags for t in x) 

2424 ) 

2425 # print("new_coltags={} previously_seen={} all_hdr_tags={}" 

2426 # .format(new_coltags, previously_seen, all_hdr_tags)) 

2427 if any(new_coltags): 

2428 ( 

2429 col, 

2430 col0_followed_by_nonempty, 

2431 col0_hdrspan, 

2432 ) = add_new_hdrspan( 

2433 col, 

2434 hdrspans, 

2435 store_new_hdrspan, 

2436 col0_followed_by_nonempty, 

2437 col0_hdrspan, 

2438 ) 

2439 

2440 continue 

2441 

2442 # These values are ignored, at least for now 

2443 if re.match(r"^(# |\(see )", col): 2443 ↛ 2444line 2443 didn't jump to line 2444 because the condition on line 2443 was never true

2444 continue 

2445 

2446 if any("dummy-skip-this" in ts for ts in rowtags): 

2447 continue # Skip this cell 

2448 

2449 # If the word has no rowtags and is a multi-row cell, then 

2450 # ignore this. This happens with empty separator rows 

2451 # within a rowspan>1 cell. cf. wander/English/Conjugation. 

2452 if rowtags == [()] and rowspan > 1: 

2453 continue 

2454 

2455 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle. 

2456 if cleanup_rules: 

2457 for regx, substitution in cleanup_rules.items(): 

2458 col = re.sub(regx, substitution, col) 

2459 

2460 if ( 2460 ↛ 2465line 2460 didn't jump to line 2465 because the condition on line 2460 was never true

2461 col_idx == 0 

2462 and not first_col_has_text 

2463 and get_lang_conf(lang, "ignore_top_left_text_cell") is True 

2464 ): 

2465 continue # Skip text at top left, as in Icelandic, Faroese 

2466 

2467 # if col0_hdrspan is not None: 

2468 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}" 

2469 # .format(col0_hdrspan.text, col)) 

2470 col0_followed_by_nonempty = True 

2471 have_text = True 

2472 

2473 # Determine column tags for the multi-column cell 

2474 combined_coltags = compute_coltags( 

2475 lang, pos, hdrspans, col_idx, colspan, col 

2476 ) 

2477 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2477 ↛ 2478line 2477 didn't jump to line 2478 because the condition on line 2477 was never true

2478 continue 

2479 

2480 # print("HAVE_TEXT:", repr(col)) 

2481 # Split the text into separate forms. First simplify spaces except 

2482 # newline. 

2483 col = re.sub(r"[ \t\r]+", " ", col) 

2484 # Split the cell text into alternatives 

2485 

2486 col, alts, split_extra_tags = split_text_into_alts(col) 

2487 

2488 # Some cells have mixed form content, like text and romanization, 

2489 # or text and IPA. Handle these. 

2490 alts = handle_mixed_lines(alts) 

2491 

2492 alts = list((x, combined_coltags) for x in alts) 

2493 

2494 # Generate forms from the alternatives 

2495 # alts is a list of (tuple of forms, tuple of tags) 

2496 for (form, base_roman, ipa), coltags in alts: 

2497 form = form.strip() 

2498 extra_tags = [] 

2499 extra_tags.extend(split_extra_tags) 

2500 # Handle special splits again here, so that we can have custom 

2501 # mappings from form to form and tags. 

2502 if form in form_replacements: 

2503 replacement, tags = form_replacements[form] 

2504 for x in tags.split(): 

2505 assert x in valid_tags 

2506 assert isinstance(replacement, str) 

2507 assert isinstance(tags, str) 

2508 form = replacement 

2509 extra_tags.extend(tags.split()) 

2510 

2511 check_romanization_form_transformation = False 

2512 # loop over regexes in form_transformation and replace text 

2513 # in form using regex patterns 

2514 # this does a bit of the same stuff the above does, 

2515 # but with regexes and re.sub() instead 

2516 for ( 

2517 form_transformations_pos, 

2518 v, 

2519 subst, 

2520 tags, 

2521 ) in form_transformations: 

2522 # v is a pattern string, like "^ich" 

2523 if pos != form_transformations_pos: 

2524 continue 

2525 m = re.search(v, form) 

2526 if m is not None: 

2527 form = re.sub(v, subst, form) 

2528 for x in tags.split(): 

2529 assert x in valid_tags 

2530 extra_tags.extend(tags.split()) 

2531 check_romanization_form_transformation = True 

2532 break 

2533 

2534 # Clean the value, extracting reference symbols 

2535 form, refs, defs, hdr_tags = extract_cell_content( 

2536 lang, word, form 

2537 ) 

2538 # if refs: 

2539 # print("REFS:", refs) 

2540 extra_tags.extend(hdr_tags) 

2541 # Extract tags from referenced footnotes 

2542 # Extract tags from referenced footnotes 

2543 refs_tags = set() 

2544 for ref in refs: 

2545 if ref in def_ht: 

2546 refs_tags.update(def_ht[ref]) 

2547 

2548 if base_roman: 

2549 if check_romanization_form_transformation: 2549 ↛ 2553line 2549 didn't jump to line 2553 because the condition on line 2549 was never true

2550 # because form_transformations are used to handle things 

2551 # where the romanization has the "same" structure, we 

2552 # need to handle that here too.... 

2553 for ( 

2554 _, 

2555 v, 

2556 subst, 

2557 _, 

2558 ) in form_transformations: 

2559 # v is a pattern string, like "^ich" 

2560 m = re.search(v, base_roman) 

2561 if m is not None: 

2562 base_roman = re.sub(v, subst, base_roman) 

2563 # XXX add tag stuff here if needed 

2564 break 

2565 

2566 base_roman, _, _, hdr_tags = extract_cell_content( 

2567 lang, word, base_roman 

2568 ) 

2569 extra_tags.extend(hdr_tags) 

2570 

2571 # Do some additional cleanup on the cell. 

2572 form = re.sub(r"^\s*,\s*", "", form) 

2573 form = re.sub(r"\s*,\s*$", "", form) 

2574 form = re.sub(r"\s*(,\s*)+", ", ", form) 

2575 form = re.sub(r"(?i)^Main:", "", form) 

2576 form = re.sub(r"\s+", " ", form) 

2577 form = form.strip() 

2578 

2579 # Look for parentheses that have semantic meaning 

2580 form, et = find_semantic_parens(form) 

2581 extra_tags.extend(et) 

2582 

2583 # Handle parentheses in the table element. We parse 

2584 # tags anywhere and romanizations anywhere but beginning. 

2585 roman = base_roman 

2586 paren = None 

2587 clitic = None 

2588 m = re.search(r"(\s+|^)\(([^)]*)\)", form) 

2589 # start|spaces + (anything) 

2590 if m is not None: 

2591 subst = m.group(1) 

2592 paren = m.group(2) 

2593 else: 

2594 m = re.search(r"\(([^)]*)\)(\s+|$)", form) 

2595 # (anything) + spaces|end 

2596 if m is not None: 2596 ↛ 2597line 2596 didn't jump to line 2597 because the condition on line 2596 was never true

2597 paren = m.group(1) 

2598 subst = m.group(2) 

2599 if paren is not None: 

2600 form, roman, clitic = handle_parens( 

2601 form, roman, clitic, extra_tags 

2602 ) 

2603 

2604 # Ignore certain forms that are not really forms, 

2605 # unless they're really, really close to the article title 

2606 if form in ( 2606 ↛ 2611line 2606 didn't jump to line 2611 because the condition on line 2606 was never true

2607 "", 

2608 "unchanged", 

2609 "after an", # in sona/Irish/Adj/Mutation 

2610 ): 

2611 Lev = distw([form], word) 

2612 if form and Lev < 0.1: 

2613 wxr.wtp.debug( 

2614 "accepted possible false positive '{}' with" 

2615 "> 0.1 Levenshtein distance in {}/{}".format( 

2616 form, word, lang 

2617 ), 

2618 sortid="inflection/2213", 

2619 ) 

2620 elif form and Lev < 0.3: 

2621 wxr.wtp.debug( 

2622 "skipped possible match '{}' with > 0.3" 

2623 "Levenshtein distance in {}/{}".format( 

2624 form, word, lang 

2625 ), 

2626 sortid="inflection/2218", 

2627 ) 

2628 continue 

2629 else: 

2630 continue 

2631 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} " 

2632 # "FORM={!r} ROMAN={!r}" 

2633 # .format(rowtags, coltags, refs_tags, 

2634 # form, roman)) 

2635 

2636 # Merge tags from row and column and do miscellaneous 

2637 # tag-related handling. 

2638 ( 

2639 merge_ret, 

2640 form, 

2641 some_has_covered_text, 

2642 ) = merge_row_and_column_tags(form, some_has_covered_text) 

2643 ret.extend(merge_ret) 

2644 

2645 # End of row. 

2646 rownum += 1 

2647 # For certain languages, if the row was empty, reset 

2648 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb). 

2649 if row_empty and get_lang_conf(lang, "empty_row_resets"): 

2650 hdrspans = [] 

2651 # Check if we should expand col0_hdrspan. 

2652 if col0_hdrspan is not None: 

2653 col0_allowed = get_lang_conf(lang, "hdr_expand_first") 

2654 col0_cats = tagset_cats(col0_hdrspan.tagsets) 

2655 # Only expand if col0_cats and later_cats are allowed 

2656 # and don't overlap and col0 has tags, and there have 

2657 # been no disallowed cells in between. 

2658 if ( 

2659 not col0_followed_by_nonempty 

2660 and not (col0_cats - col0_allowed) 

2661 and 

2662 # len(col0_cats) == 1 and 

2663 col_idx > col0_hdrspan.start + col0_hdrspan.colspan 

2664 ): 

2665 # If an earlier header is only followed by headers that yield 

2666 # no tags, expand it to entire row 

2667 # print("EXPANDING COL0: {} from {} to {} cols {}" 

2668 # .format(col0_hdrspan.text, col0_hdrspan.colspan, 

2669 # len(row) - col0_hdrspan.start, 

2670 # col0_hdrspan.tagsets)) 

2671 col0_hdrspan.colspan = len(row) - col0_hdrspan.start 

2672 col0_hdrspan.expanded = True 

2673 # XXX handle refs and defs 

2674 # for x in hdrspans: 

2675 # print(" HDRSPAN {} {} {} {!r}" 

2676 # .format(x.start, x.colspan, x.tagsets, x.text)) 

2677 

2678 # Post-process German nouns with articles in separate columns. We move the 

2679 # definite/indefinite/usually-without-article markers into the noun and 

2680 # remove the article entries. 

2681 if get_lang_conf(lang, "articles_in_separate_columns") and any( 

2682 "noun" in x["tags"] for x in ret 

2683 ): 

2684 new_ret = [] 

2685 saved_tags = set() 

2686 had_noun = False 

2687 for dt in ret: 

2688 tags = dt["tags"] 

2689 # print(tags) 

2690 if "noun" in tags: 

2691 tags = list( 

2692 sorted(set(t for t in tags if t != "noun") | saved_tags) 

2693 ) 

2694 had_noun = True 

2695 elif ( 2695 ↛ 2722line 2695 didn't jump to line 2722 because the condition on line 2695 was always true

2696 "indefinite" in tags 

2697 or "definite" in tags 

2698 or "usually-without-article" in tags 

2699 or "without-article" in tags 

2700 ): 

2701 if had_noun: 

2702 saved_tags = set(tags) 

2703 else: 

2704 saved_tags = saved_tags | set(tags) # E.g. Haus/German 

2705 remove_useless_tags(lang, pos, saved_tags) 

2706 saved_tags = saved_tags & set( 

2707 [ 

2708 "masculine", 

2709 "feminine", 

2710 "neuter", 

2711 "singular", 

2712 "plural", 

2713 "indefinite", 

2714 "definite", 

2715 "usually-without-article", 

2716 "without-article", 

2717 ] 

2718 ) 

2719 had_noun = False 

2720 continue # Skip the articles 

2721 

2722 dt = dt.copy() 

2723 dt["tags"] = tags 

2724 new_ret.append(dt) 

2725 ret = new_ret 

2726 

2727 elif possibly_ignored_forms: 

2728 # Some languages have tables with cells that are kind of separated 

2729 # and difficult to handle, like eulersche Formel/German where 

2730 # the definite and indefinite articles are just floating. 

2731 # If a language has a dict of conditionally_ignored_cells, 

2732 # and if the contents of a cell is found in one of the rules 

2733 # there, ignore that cell if it 

2734 # 1. Does not have the appropriate tag (like "definite" for "die") 

2735 # and 

2736 # 2. The title of the article is not one of the other co-words 

2737 # (ie. it's an article for the definite articles in german etc.) 

2738 # pass 

2739 new_ret = [] 

2740 for cell_data in ret: 

2741 tags = cell_data["tags"] 

2742 text = cell_data["form"] 

2743 skip_this = False 

2744 for key_tag, ignored_forms in possibly_ignored_forms.items(): 

2745 if text not in ignored_forms: 2745 ↛ 2747line 2745 didn't jump to line 2747 because the condition on line 2745 was always true

2746 continue 

2747 if word in ignored_forms: 

2748 continue 

2749 if key_tag not in tags: 

2750 skip_this = True 

2751 

2752 if skip_this: 2752 ↛ 2753line 2752 didn't jump to line 2753 because the condition on line 2752 was never true

2753 continue 

2754 new_ret.append(cell_data) 

2755 

2756 ret = new_ret 

2757 

2758 # Post-process English inflection tables, adding "multiword-construction"

2759 # when the number of words has increased. 

2760 if lang == "English" and pos == "verb": 

2761 word_words = len(word.split()) 

2762 new_ret = [] 

2763 for dt in ret: 

2764 form = dt.get("form", "") 

2765 if len(form.split()) > word_words: 

2766 dt = dt.copy() 

2767 dt["tags"] = list(dt.get("tags", [])) 

2768 # This strange copy-assigning shuffle is preventative black 

2769 # magic; do not touch lest you invoke deep bugs. 

2770 data_append(dt, "tags", "multiword-construction") 

2771 new_ret.append(dt) 

2772 ret = new_ret 

2773 

2774 # Always insert "table-tags" detail as the first entry in any inflection 

2775 # table. This way we can reliably detect where a new table starts. 

2776 # Table-tags applies until the next table-tags entry. 

2777 if ret or table_tags: 

2778 table_tags = list(sorted(set(table_tags))) 

2779 dt = { 

2780 "form": " ".join(table_tags), 

2781 "source": source, 

2782 "tags": ["table-tags"], 

2783 } 

2784 if dt["form"] == "": 

2785 dt["form"] = "no-table-tags" 

2786 if tablecontext.template_name: 

2787 tn = { 

2788 "form": tablecontext.template_name, 

2789 "source": source, 

2790 "tags": ["inflection-template"], 

2791 } 

2792 ret = [dt] + [tn] + ret 

2793 else: 

2794 ret = [dt] + ret 

2795 

2796 return ret 

2797 

2798 

def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Parse ``rows`` (a matrix of InflCell objects) as an inflection table
    and append the extracted form entries to ``data["forms"]``.

    Exact duplicates are dropped, as are "dated" variants of entries that
    already exist without the "dated" tag (e.g. Russian pre-reform
    declension tables partially duplicating the modern table)."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for table_row in rows:
        assert isinstance(table_row, list)
        for table_cell in table_row:
            assert isinstance(table_cell, InflCell)
    assert isinstance(titles, list)
    for title in titles:
        assert isinstance(title, str)

    # First attempt: parse as a simple table.
    ret = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if ret is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms, eliminating duplicates as we go.
    have_forms = set()
    for dt in ret:
        fdt = freeze(dt)
        if fdt in have_forms:
            continue  # Don't add exact duplicates
        # Skip a "dated"-tagged entry when the identical entry without
        # "dated" was already emitted (modern vs. pre-reform declensions).
        tags = dt.get("tags", [])
        stale_dated = False
        if "dated" in tags:
            undated = list(t for t in tags if t != "dated")
            if undated:
                probe = dt.copy()
                probe["tags"] = undated
                if freeze(probe) in have_forms:
                    stale_dated = True
        if stale_dated:
            continue
        # "table-tags" marker entries are never used for dedup bookkeeping.
        if "table-tags" not in tags:
            have_forms.add(fdt)
        data_append(data, "forms", dt)

2854 

2855 

def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Decide whether a table cell should be treated as a header.

    Applies a chain of heuristics (explicit header markup, expandable
    header text, CSS-style match with the first column, whole-column
    header flags, and "Conjugation of ..."-style titles) in priority
    order.  Returns a 4-tuple ``(is_title, hdr_expansion, target,
    celltext)``; ``target`` and ``celltext`` may be rewritten when the
    cell text has a "Header: target" shape.
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # Pick the header-cell kind matching the table's node kind (wikitext
    # table vs. HTML <table>); callers only pass these two kinds.
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # A data cell (not markup-wise a header) whose text nevertheless
        # expands like a header; accept only for languages/texts in the
        # per-language whitelist.
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    # The heuristics below are ordered by priority; the first match wins.
    if idx >= 0 and titletext[:idx] in infl_map:
        # "Header: target" form; split the target word(s) out of the cell.
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext

3048 

3049 

class TableContext:
    """Saved context used when parsing a table and its subtables.

    Attributes (all set in ``__init__``):
        stored_hdrspans: header spans carried over between related tables.
        section_header: header data for the current section.
        template_name: name of the template the table came from, or ""
            when unknown.
    """

    # BUG FIX: this was previously misspelled ``__slot__``, which Python
    # treats as an ordinary (inert) class attribute, so instances silently
    # still got a ``__dict__``.  With the correct name, attribute storage
    # is restricted to the declared slots.
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name=None):
        self.stored_hdrspans = []
        self.section_header = []
        # Normalize falsy inputs (None, "") to the empty string.
        if not template_name:
            self.template_name = ""
        else:
            self.template_name = template_name

3066 

3067 

def handle_wikitext_or_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms. This adds the forms into ``data``."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("-------==========-------")

    if not tablecontext:
        tablecontext = TableContext()

    def handle_table1(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        depth,
    ):
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order."""

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        col_gap_data = []  # Filling for columns with rowspan > 1
        # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
        # the column; vertical_still_left contains int
        cols_headered = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows = []

        sub_ret = []

        # from wikitextprocessor.parser import print_tree
        # print_tree(tree)
        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            # Normalize the node kind: for HTML nodes use the tag name
            # string, otherwise the NodeKind enum value.
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are initially shown in tables
                    # that have more data. The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # if (
                #     len(node.children) == 1
                #     and node.children[0].attrs.get("class") == "separator"
                # ):
                #     print("------------------ skip separator")
                #     continue

                # Parse a table row.
                row = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in get_table_cells(node):
                    # loop through each cell in the ROW

                    # The below skip is not needed anymore, because we "skip"
                    # in get_table_cells, but left here as a comment
                    # if not isinstance(col, WikiNode):
                    #     # This skip is not used for counting,
                    #     # "None" is not used in
                    #     # indexing or counting or looping.
                    #     continue
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print("  UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column. It is populated at the end of the loop, at
                        # the same time as col_gap_data. This needs to be
                        # looped and filled this way because each `for col`-
                        # looping jumps straight to the next meaningful cell;
                        # there is no "None" cells, only emptiness between,
                        # and rowspan and colspan are just to generate the
                        # fill-in cells.
                        vertical_still_left[len(row)] -= 1
                        row.append(col_gap_data[len(row)])

                        # appending row is how "indexing" is
                        # done here; something is appended,
                        # like a filler-cell here or a "start"
                        # cell at the end of the row-loop,
                        # which increased len(row) which is
                        # then used as the target-index to check
                        # for gaps. vertical_still_left is
                        # the countdown to when to stop
                        # filling in gaps, and goes down to 0,
                        # and col_gap_data is not touched
                        # except when a new rowspan is needed,
                        # at the same time that
                        # vertical_still_left gets reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        # Malformed span attribute: fall back to a 1x1 cell.
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Clamp absurd spans silently; emitting errors for these
                    # proved too noisy.
                    if colspan > 100:
                        # wxr.wtp.error(
                        #     f"Colspan {colspan} over 30, set to 1",
                        #     sortid="inflection/20250113a",
                        # )
                        colspan = 100
                    if rowspan > 100:
                        # wxr.wtp.error(
                        #     f"Rowspan {rowspan} over 30, set to 1",
                        #     sortid="inflection/20250113b",
                        # )
                        rowspan = 100

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: isinstance(x, WikiNode)
                        and (x.kind == NodeKind.TABLE or x.sarg == "table"),
                    )

                    # Clean the rest of the cell.
                    celltext = clean_node(wxr, None, rest)
                    # print("CLEANED:", celltext)
                    # print(f"SUBTABLES: {tables}")

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a
                        # descriptive text that should be treated as a title
                        # (e.g., "Forms with the definite article", with
                        # "definite" not mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Flush rows collected so far so that the
                            # subtable's rows end up in the correct order.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    # Strip trailing superscripts (footnote references).
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #     row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/ed if
                                # it is lacking slots
                                # for each column; col_gap_data and
                                # vertical_still_left are never
                                # reset to [], during
                                # the whole table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that
                            # will be used in the
                            # future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to be
                            # filled (row.append or
                            # row[col_gap_data[len(row)] =>
                            # rows), it is not reset to [], but decremented to 0
                            # each time a row gets something from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        # Return all collected (rows, titles, after, depth) chunks, keeping
        # subtable chunks in document order before the remaining rows.
        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects. Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )

3429 

3430 

def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """Yield the cell nodes of a table row, unwrapping embedded HTML cells.

    Wikitext cells sometimes contain raw `<th>`/`<td>` elements (often
    produced by template conditionals), which the parser leaves as child
    nodes of the wikitext cell.  For such cells, the embedded HTML cells
    are detached from the parent (so they are not seen twice) and yielded
    right after it; plain cells are yielded unchanged."""
    for child in node.children:
        if not isinstance(child, WikiNode):
            continue
        embedded_cells = [
            c
            for c in child.children
            if isinstance(c, HTMLNode) and c.sarg in ("th", "td")
        ]
        if not embedded_cells:
            yield child
            continue
        # Detach the th/td elements so the parent cell no longer owns them.
        child.children = [
            c
            for c in child.children
            if not (isinstance(c, HTMLNode) and c.sarg in ("th", "td"))
        ]
        yield child
        yield from embedded_cells

3459 

3460 

def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Forward an HTML table to the shared wikitext/HTML table handler.

    XXX thin pass-through kept for historical reasons; remove these?
    """
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext,
    )

3468 

3469 

def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """A passer-on function for wikitext tables, XXX, remove these?"""
    handle_wikitext_or_html_table(
        wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
    )

3477 

3478 

def parse_inflection_section(
    wxr, data, word, lang, pos, section, tree, tablecontext=None
):
    """Parses an inflection section on a page. ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    Walks ``tree`` collecting inflection tables (both wikitext TABLE
    nodes and HTML <table> elements) together with any titles gathered
    from enclosing NavFrame headers, then dispatches each collected
    table to handle_wikitext_table()/handle_html_table().
    """

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    # .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Each entry is a mutable list: [kind, table-node, titles, after-texts],
    # where "after" collects stray strings encountered after the table.
    tables = []
    # Accumulates string fragments of a NavFrame's NavHead title.
    titleparts = []
    # Title text taken from a preceding ";"-definition list item, used as
    # a default title for tables that follow it.
    preceding_bolded_title = ""

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("--------------******************----------------")

    def process_tables():
        # Dispatch every collected table to the appropriate handler.
        for kind, node, titles, after in tables:
            after = "".join(after).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node, titles):
        # Process one NavFrame div in isolation: tables found inside it
        # are collected into a fresh list and processed immediately, so
        # they get the NavFrame's own title rather than leaking into the
        # outer ``tables`` accumulator.
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(node, titles, navframe=False):
        # Depth-first walk.  In navframe mode, bare strings feed the
        # NavHead title (``titleparts``); otherwise strings found after a
        # table are appended to that table's "after" list.
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    # Show/hide widget; carries no table content.
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    # The NavHead text collected so far becomes a title
                    # for the tables inside this NavContent (unless it is
                    # just a "Note:" annotation).
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                classes = node.attrs.get("class", ())
                if "audiotable" in classes:
                    # Pronunciation audio tables are not inflection tables.
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
            if (
                kind == NodeKind.HTML
                and node.sarg == "div"
                and "NavFrame" in node.attrs.get("class", "").split()
            ):
                recurse_navframe(node, titles)
                return
        if kind == NodeKind.LINK:
            # Recurse into the link's display text if present, otherwise
            # into its target, so link text can contribute to titles.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A ";"-definition list item acts as a bolded caption for the
            # tables that follow it at the top level.
            nonlocal preceding_bolded_title
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        # NOTE(review): the bolded-title check happens per top-level
        # child, so a ";"-caption found while walking one child titles
        # the subsequent siblings — confirm this ordering is intended.
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")