Coverage for src/wiktextract/extractor/en/inflection.py: 87%

1536 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1# Code for parsing inflection tables. 

2# 

3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org. 

4 

5import collections 

6import copy 

7import functools 

8import html 

9import re 

10import unicodedata 

11from typing import Generator, Optional, Union 

12 

13from mediawiki_langcodes import code_to_name, name_to_code 

14from wikitextprocessor import MAGIC_FIRST, HTMLNode, NodeKind, WikiNode 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, freeze, split_at_comma_semi 

18from ...tags import valid_tags 

19from ...wxr_context import WiktextractContext 

20from .form_descriptions import ( 

21 classify_desc, 

22 decode_tags, 

23 distw, 

24 parse_head_final_tags, 

25) 

26from .inflection_kludges import ka_decl_noun_template_cell 

27from .inflectiondata import infl_map, infl_start_map, infl_start_re 

28from .lang_specific_configs import get_lang_conf, lang_specific_tags 

29from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS 

30from .type_utils import FormData 

31 

# --debug-text-cell WORD
# Command-line parameter for debugging. When parsing inflection tables,
# print out debug messages when encountering this text.
# Set via set_debug_cell_text() below; None disables the debug output.
debug_cell_text: Optional[str] = None

36 

37 

def set_debug_cell_text(text: str) -> None:
    """Set the module-level debug marker text (--debug-text-cell).

    Cells whose text equals this value trigger extra debug printing
    during inflection-table parsing."""
    global debug_cell_text
    debug_cell_text = text

41 

42 

# A tagset is a list of alternative tag tuples, each tuple sorted.
TagSets = list[tuple[str, ...]]

# Column texts that are interpreted as an empty column.
# Mostly various Unicode dash/hyphen codepoints that tables use as
# "no such form" placeholders.
IGNORED_COLVALUES = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "-",
    "/",
    "?",
    "not used",
    "not applicable",
}

67 

# These tags are never inherited from above (e.g. from a covering header
# row) into individual cells.
# XXX merge with lang_specific
noinherit_tags = {
    "infinitive-i",
    "infinitive-i-long",
    "infinitive-ii",
    "infinitive-iii",
    "infinitive-iv",
    "infinitive-v",
}

78 

# Subject->object transformation mapping, when using dummy-object-concord
# to replace subject concord tags with object concord tags.
# The class-N entries cover Bantu noun classes 1-18.
object_concord_replacements = {
    "first-person": "object-first-person",
    "second-person": "object-second-person",
    "third-person": "object-third-person",
    "singular": "object-singular",
    "plural": "object-plural",
    "definite": "object-definite",
    "indefinite": "object-indefinite",
    "class-1": "object-class-1",
    "class-2": "object-class-2",
    "class-3": "object-class-3",
    "class-4": "object-class-4",
    "class-5": "object-class-5",
    "class-6": "object-class-6",
    "class-7": "object-class-7",
    "class-8": "object-class-8",
    "class-9": "object-class-9",
    "class-10": "object-class-10",
    "class-11": "object-class-11",
    "class-12": "object-class-12",
    "class-13": "object-class-13",
    "class-14": "object-class-14",
    "class-15": "object-class-15",
    "class-16": "object-class-16",
    "class-17": "object-class-17",
    "class-18": "object-class-18",
    "masculine": "object-masculine",
    "feminine": "object-feminine",
}

110 

# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Sanity check at import time: every mapped value must consist of valid tags.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Matches table-title prefixes like "Conjugation of <word>" that should
# not themselves be interpreted as tag-bearing words.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)

151 

# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
    "contracted": "contracted",
    "present": "present",
    "perfect": "perfect",
    "imperfect": "imperfect",
    "pluperfect": "pluperfect",
    "future": "future",
    "aorist": "aorist",
}
# Sanity check at import time: every mapped value must consist of valid tags.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)

233 

# Parenthesized elements in title that are converted to tags in
# "table-tags" form.  Matched against whole comma-separated elements
# inside parentheses (case-sensitive; many are Ancient Greek dialects).
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "iterative": "iterative",
    "poetic": "poetic",
    "Attic": "Attic",
    "Epic": "Epic",
    "Aeolic": "Aeolic",
    "Arcadocypriot": "Arcadocypriot",
    "Old Attic": "Old-Attic",
    "Boeotian": "Boeotian",
    "Byzantine": "Byzantine",
    "Choral Doric": "Choral-Doric",
    "Doric": "Doric",
    "Elean": "Elean",
    "Epirote": "Epirote",
    "Ionic": "Ionic",
    "Koine": "Koine",
    "Cretan": "Cretan",
    "Corinthian": "Corinthian",
    "Laconian": "Laconian",
    "Later poetic": "Later-poetic-Ancient-Greek",
    "Lesbian": "Lesbian",
    "Locrian": "Locrian",
    "Lyric": "Lyric-Ancient-Greek",
    "Thessalian": "Thessalian",
    "Tragic": "Tragic-Ancient-Greek",
}
# Sanity check at import time: every mapped value must consist of valid tags.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

275 

# Parenthesized element starts to map them to tags for form for the rest of
# the element (e.g. "(Kotus type 5)" -> a "class" form with value "5").
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Sanity check at import time: every mapped value must consist of valid tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Anchored at the start of the element; the trailing space separates the
# recognized prefix from the value that follows it.
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)

296 

297 

298# Regexp for cell starts that are likely definitions of reference symbols. 

299# See also nondef_re. 

300def_re = re.compile( 

301 r"(\s*•?\s+)?" 

302 r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|" 

303 r"\^(\*+|[△†])|" 

304 r"([¹²³⁴⁵⁶⁷⁸⁹])|" 

305 r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))" 

306) 

307# ᴺᴸᴴ persan/Old Irish 

308 

309# Regexp for cell starts that are exceptions to def_re and do not actually 

310# start a definition. 

311nondef_re = re.compile( 

312 r"(^\s*(1|2|3)\s+(sg|pl)\s*$|" # 1s or 3p etc. 

313 r"\s*\d\d?\s*/\s*\d\d?\s*$)" 

314) # taka/Swahili "15 / 17" 

315 

316 

class InflCell:
    """One cell of an inflection table as seen by the parser.

    ``text`` is stored stripped; ``is_title`` marks header cells (falsy
    text can never mark a header); ``colspan``/``rowspan`` give the
    cell's span; ``target`` optionally carries a link target."""

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        # A cell with empty text is never treated as a title, regardless
        # of the is_title flag.
        self.is_title = text and is_title
        self.text = text.strip()
        self.rowspan = rowspan
        self.colspan = colspan
        self.target = target

    def __str__(self) -> str:
        desc = "{}/{}/{}/{!r}".format(
            self.text, self.is_title, self.colspan, self.rowspan
        )
        return desc + ": {!r}".format(self.target) if self.target else desc

    def __repr__(self) -> str:
        return str(self)

357 

358 

class HdrSpan:
    """Bookkeeping for one header cell/span encountered while parsing a
    table: its position, span, canonicalized tagsets, and expansion
    state."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: TagSets,
        text: str,
        all_headers_row: bool,
    ) -> None:
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        assert all(isinstance(ts, tuple) for ts in tagsets)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Canonicalize each tagset: drop duplicate tags, sort, store
        # as an immutable tuple.
        self.tagsets = [tuple(sorted(set(ts))) for ts in tagsets]
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False

399 

400 

def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        uni_name = unicodedata.name(ch)
    except ValueError:
        # Codepoints without a Unicode name cannot be superscripts.
        return False
    # Superscript digits/signs and modifier letters all render as raised
    # small glyphs; their Unicode names share these prefixes.
    return uni_name.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )

417 

418 

def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no purpose
    together (cover all options).  Mutates ``tags`` in place."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    # Opposite pairs cancel out when both are present, provided the
    # language configuration enables the removal.
    for pair, conf_key in (
        (("animate", "inanimate"), "animate_inanimate_remove"),
        (("virile", "nonvirile"), "virile_nonvirile_remove"),
    ):
        if all(t in tags for t in pair) and get_lang_conf(lang, conf_key):
            for t in pair:
                tags.remove(t)
    # For each closed category of the language (numbers, genders, ...):
    # if every possible value is present, the tags carry no information,
    # so drop the entire category.
    for conf_key in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        category_values = get_lang_conf(lang, conf_key)
        if category_values and all(x in tags for x in category_values):
            for x in category_values:
                tags.remove(x)

469 

470 

def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    # Look up each tag's category and merge across all alternatives.
    return {valid_tags[tag] for alt in tagset for tag in alt}

475 

476 

def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # Fixed copy-paste bug: this precondition previously re-checked
    # tagsets1, leaving tagsets2 entirely unvalidated.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Merge ``tags1`` into ``tagsets``, combining it with an existing
        # alternative when they differ in at most one tag category.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2: count the
            # number of categories in which they differ.
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        # Both inputs were empty (or contained only empty tuples); keep
        # a single empty alternative so callers always get >= 1 tagset.
        tagsets.append(())

    return tagsets

539 

540 

def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # Fixed copy-paste bug: this precondition previously re-checked
    # tagsets1, leaving tagsets2 entirely unvalidated.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    # Cartesian product: union every alternative of tagsets1 with every
    # alternative of tagsets2, deduplicating the results.
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    return new_tagsets

567 

568 

@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags).

    ``cleaned`` is the remaining header text, ``refs`` is a list of
    reference/footnote markers attached to the cell, ``defs`` is a list of
    (ref, definition) pairs when the cell itself defines footnote symbols,
    and ``tags`` is a list of tags implied by special reference markers.
    Cached because the same header strings recur across many tables."""
    # print("EXTRACT_CELL_CONTENT {!r}".format(col))
    hdr_tags = []
    # Strip trailing comma/bullet and collapse whitespace.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Prose-like cells (explanatory notes) are ignored wholesale.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers of the form ^x or ^(x,y) at the
    # end of the cell, repeatedly until none remain.
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                # Language-specific marker that maps directly to tags.
                hdr_tags.extend(special_references[r1].split())
            else:
                # v = m.group(1)
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if it is a ref definition
    # print("BEFORE REF CHECK: {!r}".format(col))
    m = def_re.match(col)
    # print(f"Before def_re: {refs=}")
    if m and not nondef_re.match(col):
        # The cell defines footnote symbols: split it into
        # (symbol, definition-text) pairs.
        ofs = 0
        ref = None
        deflst = []
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        # print("deflst:", deflst)
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition
    # print(f"After def_re: {refs=}")
    # Peel trailing superscript characters (and daggers) off the cell one
    # at a time, interpreting recognized sequences as tags or refs.
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition ("1) text", "2: text", ...)
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    # print("EXTRACT_CELL_CONTENT: orig_col={!r} col={!r} refs={!r} hdr_tags={}"
    #       .format(orig_col, col, refs, hdr_tags))
    return col.strip(), refs, [], hdr_tags

701 

702 

@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is dictionary describing additional forms to be
    included in the part-of-speech entry).  Cached because the same titles
    recur across many pages."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Strip HTML markup/entities and normalize whitespace.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    # print("PARSE_TITLE:", title)
    global_tags: list[str] = []
    table_tags: list[str] = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip "Inflection of <word>"-style prefixes matched by the regex.
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    # French reflexive verbs ("s'..." / "se ...") get a global tag.
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    # Recognized prefix (e.g. "class"); the remainder of
                    # the element becomes an extra "class" form.
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contains no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms

788 

789 

790def expand_header( 

791 wxr: WiktextractContext, 

792 tablecontext: "TableContext", 

793 word: str, 

794 lang: str, 

795 pos: str, 

796 text: str, 

797 base_tags: Union[list[str], set[str], tuple[str, ...]], 

798 silent=False, 

799 ignore_tags=False, 

800 depth=0, 

801 column_number: int | None = None, 

802) -> list[tuple[str, ...]]: 

803 """Expands a cell header to tagset, handling conditional expressions 

804 in infl_map. This returns list of tuples of tags, each list element 

805 describing an alternative interpretation. ``base_tags`` is combined 

806 column and row tags for the cell in which the text is being interpreted 

807 (conditional expressions in inflection data may depend on it). 

808 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags`` 

809 is True, then tags listed in "if" will be ignored in the test (this is 

810 used when trying to heuristically detect whether a non-<th> cell is anyway 

811 a header).""" 

812 assert isinstance(wxr, WiktextractContext) 

813 assert isinstance(word, str) 

814 assert isinstance(lang, str) 

815 assert isinstance(pos, str) 

816 assert isinstance(text, str) 

817 assert isinstance(base_tags, (list, tuple, set)) 

818 assert silent in (True, False) 

819 assert isinstance(depth, int) 

820 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags)) 

821 # First map the text using the inflection map 

822 text = clean_value(wxr, text) 

823 combined_return: list[tuple[str, ...]] = [] 

824 parts = split_at_comma_semi(text, separators=[";"]) 

825 for text in parts: 

826 if not text: 826 ↛ 827line 826 didn't jump to line 827 because the condition on line 826 was never true

827 continue 

828 if text in infl_map: 

829 v = infl_map[text] # list or string 

830 else: 

831 m = re.match(infl_start_re, text) 

832 if m is not None: 832 ↛ 833line 832 didn't jump to line 833 because the condition on line 832 was never true

833 v = infl_start_map[m.group(1)] 

834 # print("INFL_START {} -> {}".format(text, v)) 

835 elif re.match(r"Notes", text): 

836 # Ignored header 

837 # print("IGNORING NOTES") 

838 combined_return = or_tagsets( 

839 lang, pos, combined_return, [("dummy-skip-this",)] 

840 ) 

841 # this just adds dummy-skip-this 

842 continue 

843 elif text in IGNORED_COLVALUES: 

844 combined_return = or_tagsets( 

845 lang, pos, combined_return, [("dummy-ignore-skipped",)] 

846 ) 

847 continue 

848 # Try without final parenthesized part 

849 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text) 

850 if text_without_parens in infl_map: 

851 v = infl_map[text_without_parens] 

852 elif m is None: 852 ↛ 868line 852 didn't jump to line 868 because the condition on line 852 was always true

853 if not silent: 

854 wxr.wtp.debug( 

855 "inflection table: unrecognized header: {}".format( 

856 repr(text) 

857 ), 

858 sortid="inflection/735", 

859 ) 

860 # Unrecognized header 

861 combined_return = or_tagsets( 

862 lang, pos, combined_return, [("error-unrecognized-form",)] 

863 ) 

864 continue 

865 

866 # Then loop interpreting the value, until the value is a simple string. 

867 # This may evaluate nested conditional expressions. 

868 default_else = None 

869 while True: 

870 # If it is a string, we are done. 

871 if isinstance(v, str): 

872 tags = set(v.split()) 

873 remove_useless_tags(lang, pos, tags) 

874 tagset = [tuple(sorted(tags))] 

875 break 

876 # For a list, just interpret it as alternatives. (Currently the 

877 # alternatives must directly be strings.) 

878 if isinstance(v, (list, tuple)): 

879 tagset = [] 

880 for x in v: 

881 tags = set(x.split()) 

882 remove_useless_tags(lang, pos, tags) 

883 tags_t = tuple(sorted(tags)) 

884 if tags_t not in tagset: 884 ↛ 880line 884 didn't jump to line 880 because the condition on line 884 was always true

885 tagset.append(tags_t) 

886 break 

887 # Otherwise the value should be a dictionary describing a 

888 # conditional expression. 

889 if not isinstance(v, dict): 889 ↛ 890line 889 didn't jump to line 890 because the condition on line 889 was never true

890 wxr.wtp.debug( 

891 "inflection table: internal: " 

892 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]), 

893 sortid="inflection/767", 

894 ) 

895 tagset = [()] 

896 break 

897 # Evaluate the conditional expression. 

898 assert isinstance(v, dict) 

899 cond: Union[bool, str] = "default-true" 

900 c: Union[str, list[str], set[str]] = "" 

901 # Handle "lang" condition. The value must be either a 

902 # single language or a list of languages, and the 

903 # condition evaluates to True if the table is one of 

904 # those languages. 

905 if "lang" in v: 

906 c = v["lang"] 

907 # check if it's a code and transform if necessary 

908 if isinstance(c, str): 

909 if c != lang: 

910 cond = lang == code_to_name(c, "en") 

911 else: 

912 cond = True 

913 else: 

914 assert isinstance(c, (list, tuple, set)) 

915 if lang not in c: 

916 cond = name_to_code(lang, "en") in c 

917 else: 

918 cond = True 

919 # Handle "nested-table-depth" condition. The value must 

920 # be an int or list of ints, and the condition evaluates 

921 # True if the depth is one of those values. 

922 # "depth" is how deep into a nested table tree the current 

923 # table lies. It is first started in handle_wikitext_table, 

924 # so only applies to tables-within-tables, not other 

925 # WikiNode content. `depth` is currently only passed as a 

926 # parameter down the table parsing stack, and not stored. 

927 if cond and "nested-table-depth" in v: 927 ↛ 928line 927 didn't jump to line 928 because the condition on line 927 was never true

928 d = v["nested-table-depth"] 

929 if isinstance(d, int): 

930 cond = d == depth 

931 else: 

932 assert isinstance(d, (list, tuple, set)) 

933 cond = depth in d 

934 # Column index: check if we're in position X of the row 

935 if cond and "column-index" in v: 

936 index = v["column-index"] 

937 if isinstance(index, int): 937 ↛ 940line 937 didn't jump to line 940 because the condition on line 937 was always true

938 cond = index == column_number 

939 else: 

940 assert isinstance(index, (list, tuple, set)) 

941 cond = column_number in index 

942 # Handle inflection-template condition. Must be a string 

943 # or list of strings, and if tablecontext.template_name is in 

944 # those, accept the condition. 

945 # TableContext.template_name is passed down from page/ 

946 # parse_inflection, before parsing and expanding itself 

947 # has begun. 

948 if cond and tablecontext and "inflection-template" in v: 

949 d1 = v["inflection-template"] 

950 if isinstance(d1, str): 950 ↛ 953line 950 didn't jump to line 953 because the condition on line 950 was always true

951 cond = d1 == tablecontext.template_name 

952 else: 

953 assert isinstance(d1, (list, tuple, set)) 

954 cond = tablecontext.template_name in d1 

955 # Handle "pos" condition. The value must be either a single 

956 # part-of-speech or a list of them, and the condition evaluates to 

957 # True if the part-of-speech is any of those listed. 

958 if cond and "pos" in v: 

959 c = v["pos"] 

960 if isinstance(c, str): 

961 cond = c == pos 

962 else: 

963 assert isinstance(c, (list, tuple, set)) 

964 cond = pos in c 

965 # Handle "if" condition. The value must be a string containing a 

966 # space-separated list of tags. The condition evaluates to True if 

967 # ``base_tags`` contains all of the listed tags. If the condition 

968 # is of the form "any: ...tags...", then any of the tags will be 

969 # enough. 

970 if cond and "if" in v and not ignore_tags: 

971 c = v["if"] 

972 assert isinstance(c, str) 

973 # "if" condition is true if any of the listed tags is present if 

974 # it starts with "any:", otherwise all must be present 

975 if c.startswith("any: "): 

976 cond = any(t in base_tags for t in c[5:].split()) 

977 else: 

978 cond = all(t in base_tags for t in c.split()) 

979 

980 # Handle "default" assignment. Store the value to be used 

981 # as a default later. 

982 if "default" in v: 

983 assert isinstance(v["default"], str) 

984 default_else = v["default"] 

985 

986 # Warning message about missing conditions for debugging. 

987 

988 if cond == "default-true" and not default_else and not silent: 

989 wxr.wtp.debug( 

990 "inflection table: IF MISSING COND: word={} " 

991 "lang={} text={} base_tags={} c={} cond={}".format( 

992 word, lang, text, base_tags, c, cond 

993 ), 

994 sortid="inflection/851", 

995 ) 

996 # Based on the result of evaluating the condition, select either 

997 # "then" part or "else" part. 

998 if cond: 

999 v = v.get("then", "") 

1000 else: 

1001 v1 = v.get("else") 

1002 if v1 is None: 

1003 if default_else is not None: 

1004 v = default_else 

1005 else: 

1006 if not silent: 

1007 wxr.wtp.debug( 

1008 "inflection table: IF WITHOUT ELSE EVALS " 

1009 "False: " 

1010 "{}/{} {!r} base_tags={}".format( 

1011 word, lang, text, base_tags 

1012 ), 

1013 sortid="inflection/865", 

1014 ) 

1015 v = "error-unrecognized-form" 

1016 else: 

1017 v = v1 

1018 

1019 # Merge the resulting tagset from this header part with the other 

1020 # tagsets from the whole header 

1021 combined_return = or_tagsets(lang, pos, combined_return, tagset) 

1022 

1023 # Return the combined tagsets, or empty tagset if we got no tagsets 

1024 if not combined_return: 

1025 combined_return = [()] 

1026 return combined_return 

1027 

1028 

def compute_coltags(
    lang: str,
    pos: str,
    # annotation fixed: these are HdrSpan objects (attribute access below),
    # not strings
    hdrspans: list["HdrSpan"],
    start: int,
    colspan: int,
    # annotation fixed: asserted to be str below; used for debugging only
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    Walks ``hdrspans`` in reverse order (headers closer to the data cell
    first), keeps only spans that horizontally overlap the column range
    [start, start + colspan), and combines their tagsets row by row with
    and_tagsets()/or_tagsets().  A number of language-specific heuristics
    (via get_lang_conf) decide when a header higher up in the table should
    stop the scan or be skipped, so that unrelated tags do not bleed into
    the cell.  Returns a list of sorted tag tuples (alternative tagsets).
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    #       .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                "  row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    used = set()  # (start, colspan) keys already consumed
    coltags = [()]
    last_header_row = 1000000
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]
    row_tagsets_rownum = 1000000
    used_hdrspans = set()  # id()s of hdrspans already merged in
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets. This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row. Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside

                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above. However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 10000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags

1409 

1410 

1411def parse_simple_table( 

1412 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth 

1413): 

1414 """This is the default table parser. Despite its name, it can parse 

1415 complex tables. This returns a list of forms to be added to the 

1416 part-of-speech, or None if the table could not be parsed.""" 

1417 assert isinstance(wxr, WiktextractContext) 

1418 assert isinstance(tablecontext, TableContext) 

1419 assert isinstance(word, str) 

1420 assert isinstance(lang, str) 

1421 assert isinstance(pos, str) 

1422 assert isinstance(rows, list) 

1423 assert isinstance(source, str) 

1424 assert isinstance(after, str) 

1425 assert isinstance(depth, int) 

1426 for row in rows: 

1427 for col in row: 

1428 assert isinstance(col, InflCell) 

1429 assert isinstance(titles, list) 

1430 for x in titles: 

1431 assert isinstance(x, str) 

1432 

1433 # print("PARSE_SIMPLE_TABLE: TITLES:", titles) 

1434 if debug_cell_text: 1434 ↛ 1435line 1434 didn't jump to line 1435 because the condition on line 1434 was never true

1435 print("ROWS:") 

1436 for row in rows: 

1437 print(" ", row) 

1438 

1439 # Check for forced rowspan kludge. See e.g. 

1440 # maorski/Serbo-Croatian. These are essentially multi-row 

1441 # cells implemented using <br> rather than separate cell. We fix this 

1442 # by identifying rows where this happens, and splitting the current row 

1443 # to multiple rows by synthesizing additional cells. 

1444 new_rows = [] 

1445 for row in rows: 

1446 split_row = ( 

1447 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row) 

1448 and 

1449 # x is an InflCell 

1450 all(x.rowspan == 1 for x in row) 

1451 ) 

1452 if not split_row: 

1453 new_rows.append(row) 

1454 continue 

1455 row1 = [] 

1456 row2 = [] 

1457 for cell in row: 

1458 cell1 = copy.deepcopy(cell) 

1459 if "\n" in cell.text: 

1460 # Has more than one line - split this cell 

1461 parts = cell.text.strip().splitlines() 

1462 if len(parts) != 2: 1462 ↛ 1463line 1462 didn't jump to line 1463 because the condition on line 1462 was never true

1463 wxr.wtp.debug( 

1464 "forced rowspan kludge got {} parts: {!r}".format( 

1465 len(parts), cell.text 

1466 ), 

1467 sortid="inflection/1234", 

1468 ) 

1469 cell2 = copy.deepcopy(cell) 

1470 cell1.text = parts[0] 

1471 cell2.text = parts[1] 

1472 else: 

1473 cell1.rowspan = 2 

1474 cell2 = cell1 # ref, not a copy 

1475 row1.append(cell1) 

1476 row2.append(cell2) 

1477 new_rows.append(row1) 

1478 new_rows.append(row2) 

1479 rows = new_rows 

1480 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:") 

1481 # for row in rows: 

1482 # print(" ", row) 

1483 

1484 # Parse definitions for references (from table itself and from text 

1485 # after it) 

1486 def_ht = {} 

1487 

def add_defs(defs: list[tuple[str, str]]) -> None:
    """Interpret footnote definitions as tags and record them in def_ht.

    ``defs`` is a list of (reference, definition-text) pairs.  Only the
    text before the first ". " is considered; it is decoded with
    decode_tags() (retrying once with the first letter lowercased), and
    on success the flattened, sorted tag tuple is stored under the
    reference key in the enclosing ``def_ht`` dict.  Definitions that
    cannot be parsed as tags are silently skipped.
    """
    for ref, raw in defs:
        # print("DEF: ref={} d={}".format(ref, raw))
        text = raw.strip().split(". ")[0].strip()  # text before ". "
        if not text:
            continue
        if text.endswith("."):  # catc ".."??
            text = text[:-1]
        tagsets, topics = decode_tags(text, no_unknown_starts=True)
        # print(f"{ref=}, {transformed=}, {tags=}")
        if topics or any("error-unknown-tag" in ts for ts in tagsets):
            # Retry with the first character lowercased.
            text = text[0].lower() + text[1:]
            tagsets, topics = decode_tags(text, no_unknown_starts=True)
            if topics or any("error-unknown-tag" in ts for ts in tagsets):
                # Failed to parse as tags
                # print("Failed: topics={} tags={}"
                #       .format(topics, tags))
                continue
        # Flatten all alternative tagsets into one set (union).
        flat: set[str] = set()
        for ts in tagsets:
            flat.update(ts)
        # print("DEFINED: {} -> {}".format(ref, tags1))
        def_ht[ref] = tuple(sorted(flat))

1514 

def generate_tags(
    rowtags: list[tuple[str]], table_tags: list[str]
) -> tuple[
    list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
]:
    """Expand the current header cell text into tag tuples.

    Closure over parse_simple_table locals: ``col_idx``, ``colspan``,
    ``col``/``text`` (the cell text), ``hdrspans``, ``global_tags``,
    ``refs_tags`` and ``hdr_tags`` — verify against the call site.
    For every combination of a row tagset and a column tagset computed
    by compute_coltags(), the header text is expanded via
    expand_header(); results are accumulated (deduplicated, sorted)
    into three parallel lists:

    Returns:
        (new_rowtags, new_coltags, all_hdr_tags) — updated row tagsets,
        column tagsets (with footnote-reference tags merged in), and
        the raw expanded header tagsets respectively.
    """
    new_coltags = []
    all_hdr_tags = []  # list of tuples
    new_rowtags = []
    for rt0 in rowtags:
        for ct0 in compute_coltags(
            lang,
            pos,
            hdrspans,
            col_idx,  # col_idx=>start
            colspan,
            col,  # cell_text
        ):
            # Union of all tag sources currently in effect; used by
            # "if"-conditions in infl_map during expansion.
            base_tags: set[str] = (
                set(rt0)
                | set(ct0)
                | set(global_tags)
                | set(table_tags)
            )  # Union.
            # print(f"{rt0=}, {ct0=}, {global_tags=},"
            #       f" {table_tags=}, {base_tags=}")
            alt_tags = expand_header(
                wxr,
                tablecontext,
                word,
                lang,
                pos,
                text,
                base_tags,
                depth=depth,
                column_number=col_idx,
            )
            # base_tags are used in infl_map "if"-conds.
            for tt in alt_tags:
                if tt not in all_hdr_tags:
                    all_hdr_tags.append(tt)
                tt_s = set(tt)
                # Add tags from referenced footnotes
                tt_s.update(refs_tags)
                # Sort, convert to tuple, and add to set of
                # alternatives.
                tt = tuple(sorted(tt_s))
                if tt not in new_coltags:
                    new_coltags.append(tt)
                # Kludge (saprast/Latvian/Verb): ignore row tags
                # if trying to add a non-finite after mood.
                if any(valid_tags[t] == "mood" for t in rt0) and any(
                    valid_tags[t] == "non-finite" for t in tt
                ):
                    tags = tuple(sorted(set(tt) | set(hdr_tags)))
                else:
                    tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                if tags not in new_rowtags:
                    new_rowtags.append(tags)
    return new_rowtags, new_coltags, all_hdr_tags

1574 

def add_new_hdrspan(
    col: str,
    hdrspans: list[HdrSpan],
    store_new_hdrspan: bool,
    col0_followed_by_nonempty: bool,
    col0_hdrspan: Optional[HdrSpan],
) -> tuple[str, bool, Optional[HdrSpan]]:
    """Create and register a HdrSpan for the current header cell.

    Closure over parse_simple_table locals: ``col_idx``, ``colspan``,
    ``rowspan``, ``rownum``, ``new_coltags``, ``all_headers``,
    ``all_hdr_tags``, ``previously_seen`` and ``tablecontext`` —
    verify against the call site.  Besides appending the new span to
    ``hdrspans``, this maintains the left-column header expansion
    state (col0_hdrspan / col0_followed_by_nonempty), which is
    threaded back to the caller through the return value.
    """
    hdrspan = HdrSpan(
        col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
    )
    hdrspans.append(hdrspan)

    # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
    # to be added to a register of stored hdrspans to be used
    # later with "dummy-load-stored-hdrspans".
    if store_new_hdrspan:
        tablecontext.stored_hdrspans.append(hdrspan)

    # Handle headers that are above left-side header
    # columns and are followed by personal pronouns in
    # remaining columns (basically headers that
    # evaluate to no tags). In such cases widen the
    # left-side header to the full row.
    if previously_seen:  # id(cell) in seen_cells previously
        col0_followed_by_nonempty = True
        return col, col0_followed_by_nonempty, col0_hdrspan
    elif col0_hdrspan is None:
        col0_hdrspan = hdrspan
    elif any(all_hdr_tags):
        col0_cats = tagset_cats(col0_hdrspan.tagsets)
        later_cats = tagset_cats(all_hdr_tags)
        col0_allowed = get_lang_conf(lang, "hdr_expand_first")
        later_allowed = get_lang_conf(lang, "hdr_expand_cont")
        later_allowed = later_allowed | set(["dummy"])
        # dummy2 has different behavior than plain dummy
        # and does not belong here.

        # print("col0_cats={} later_cats={} "
        #       "fol_by_nonempty={} col_idx={} end={} "
        #       "tagsets={}"
        #       .format(col0_cats, later_cats,
        #               col0_followed_by_nonempty, col_idx,
        #               col0_hdrspan.start +
        #               col0_hdrspan.colspan,
        #               col0_hdrspan.tagsets))
        # print("col0.rowspan={} rowspan={}"
        #       .format(col0_hdrspan.rowspan, rowspan))
        # Only expand if [col0_cats and later_cats are allowed
        # and don't overlap] and [col0 has tags], and there have
        # been [no disallowed cells in between].
        #
        # There are three cases here:
        #   - col0_hdrspan set, continue with allowed current
        #   - col0_hdrspan set, expand, start new
        #   - col0_hdrspan set, no expand, start new
        if (
            not col0_followed_by_nonempty
            and
            # XXX Only one cat of tags: kunna/Swedish
            # XXX len(col0_cats) == 1 and
            col0_hdrspan.rowspan >= rowspan
            and
            # from hdrspan
            not (later_cats - later_allowed)
            and not (col0_cats & later_cats)
        ):
            # First case: col0 set, continue
            return col, col0_followed_by_nonempty, col0_hdrspan
        # We are going to start new col0_hdrspan. Check if
        # we should expand.
        if (
            not col0_followed_by_nonempty
            and not (col0_cats - col0_allowed)
            and
            # Only "allowed" allowed
            # XXX len(col0_cats) == 1 and
            col_idx > col0_hdrspan.start + col0_hdrspan.colspan
        ):
            # col_idx is beyond current colspan
            # *Expand* current col0_hdrspan
            # print("EXPANDING COL0 MID: {} from {} to {} "
            #       "cols {}"
            #       .format(col0_hdrspan.text,
            #               col0_hdrspan.colspan,
            #               col_idx - col0_hdrspan.start,
            #               col0_hdrspan.tagsets))
            col0_hdrspan.colspan = col_idx - col0_hdrspan.start
            col0_hdrspan.expanded = True
        # Clear old col0_hdrspan
        if col == debug_cell_text:
            print("START NEW {}".format(hdrspan.tagsets))
        col0_hdrspan = None
        # Now start new, unless it comes from previous row
        if not previously_seen:
            col0_hdrspan = hdrspan
            col0_followed_by_nonempty = False
    return col, col0_followed_by_nonempty, col0_hdrspan

1672 

    def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]:
        """Split one table-cell text into alternative forms.

        Returns ``(col, alts, split_extra_tags)`` where ``col`` is the
        (possibly rewritten) cell text, ``alts`` is the list of
        alternative form strings extracted from it, and
        ``split_extra_tags`` is a list of tags applying to all of the
        alternatives (populated only for language-specific special
        phrase splits).

        Uses closure variables from the enclosing function: ``lang``,
        ``special_phrase_splits``.
        """
        split_extra_tags = []
        if col and is_superscript(col[0]):
            # A cell starting with a superscript character is kept whole
            # instead of being split into alternatives.
            alts = [col]
        else:
            separators = [";", "•", r"\n", " or "]
            if " + " not in col:
                separators.append(",")
            if not col.endswith("/"):
                separators.append("/")
            if col in special_phrase_splits:
                # Use language-specific special splits.
                # These are phrases and constructions that have
                # unique ways of splitting, not specific characters
                # to split on like with the default splitting.
                alts, tags = special_phrase_splits[col]
                split_extra_tags = tags.split()
                for x in split_extra_tags:
                    assert x in valid_tags
                assert isinstance(alts, (list, tuple))
                assert isinstance(tags, str)
            else:
                # Use default splitting. However, recognize
                # language-specific replacements and change them to magic
                # characters before splitting. This way we won't split
                # them. This is important for, e.g., recognizing
                # alternative pronouns.
                # The magic characters are characters out of Unicode scope
                # that are given a simple incremental value, int > unicode.
                repls = {}
                magic_ch = MAGIC_FIRST
                trs = get_lang_conf(lang, "form_transformations")
                # trs is a list of lists of strings
                for _, v, _, _ in trs:
                    # v is a pattern string, like "^ich"
                    # form_transformations data is doing double-duty here,
                    # because the pattern strings are already known to us and
                    # not meant to be split.
                    m = re.search(v, col)
                    if m is not None:
                        # if pattern found in text
                        magic = chr(magic_ch)
                        magic_ch += 1  # next magic character value
                        col = re.sub(v, magic, col)  # replace with magic ch
                        repls[magic] = m.group(0)
                        # remember what regex match string each magic char
                        # replaces. .group(0) is the whole match.
                alts0 = split_at_comma_semi(col, separators=separators)
                # with magic characters in place, split the text so that
                # pre-transformation text is out of the way.
                alts = []
                for alt in alts0:
                    # create a new list with the separated items and
                    # the magic characters replaced with the original texts.
                    for k, v in repls.items():
                        alt = re.sub(k, v, alt)
                    alts.append(alt)

        # Remove "*" from beginning of forms, as in non-attested
        # or reconstructed forms. Otherwise it might confuse romanization
        # detection.
        alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts)
        # Drop pronunciation-note pseudo-forms entirely.
        alts = list(
            x for x in alts if not re.match(r"pronounced with |\(with ", x)
        )
        # Strip leading "(in the sense ...)" qualifiers from the forms.
        alts = list(
            re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts
        )
        return col, alts, split_extra_tags

1743 

    def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]:
        """Pair up forms with their romanizations or IPA in a cell.

        Handles the special case where romanization is given under the
        normal form, e.g. in Russian. There can be multiple
        comma-separated forms in each case. We also handle the case
        where instead of romanization we have IPA pronunciation
        (e.g., avoir/French/verb).

        Returns a list of ``(form, romanization, ipa)`` triples; the
        romanization and/or ipa slots are empty strings when not
        detected. Uses the closure variable ``tablecontext``.
        """
        len2 = len(alts) // 2
        # Check for IPAs (forms first, IPAs under):
        # base, base, IPA, IPA
        if (
            len(alts) % 2 == 0  # Divisible by two
            and all(
                re.match(r"^\s*/.*/\s*$", x)  # Inside slashes = IPA
                for x in alts[len2:]
            )
        ):  # In the second half of alts
            nalts = list(
                (alts[i], "", alts[i + len2])
                # List of tuples: (base, "", ipa)
                for i in range(len2)
            )
        # base, base, base, IPA
        elif (
            len(alts) > 2
            and re.match(r"^\s*/.*/\s*$", alts[-1])
            and all(not x.startswith("/") for x in alts[:-1])
        ):
            # Only if the last alt is IPA
            nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1))
        # base, IPA, IPA, IPA
        elif (
            len(alts) > 2
            and not alts[0].startswith("/")
            and all(
                re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts))
            )
        ):
            # First is base and the rest are IPA alternatives
            nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts)))

        # Check for romanizations, forms first, romanizations under
        elif (
            len(alts) % 2 == 0
            and not any("(" in x for x in alts)
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        # Remove ends of strings starting from ^.
                        # Superscripts have been already removed
                        # from the string, while ^xyz needs to be
                        # removed separately, though it's usually
                        # something with a single letter?
                        "".join(xx for xx in x if not is_superscript(xx)),
                    )
                )
                == "other"
                for x in alts[:len2]
            )
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in x if not is_superscript(xx)),
                    )
                )
                in ("romanization", "english")
                for x in alts[len2:]
            )
        ):
            nalts = list((alts[i], alts[i + len2], "") for i in range(len2))
        # Check for romanizations, forms and romanizations alternating
        elif (
            len(alts) % 2 == 0
            and not any("(" in x for x in alts)
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in alts[i] if not is_superscript(xx)),
                    )
                )
                == "other"
                for i in range(0, len(alts), 2)
            )
            and all(
                classify_desc(
                    re.sub(
                        r"\^.*$",
                        "",
                        "".join(xx for xx in alts[i] if not is_superscript(xx)),
                    )
                )
                in ("romanization", "english")
                for i in range(1, len(alts), 2)
            )
        ):
            # even indices are forms, odd indices are romanizations
            nalts = list(
                (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2)
            )
        # Handle complex Georgian entries with alternative forms and
        # romanizations. It's a bit of a mess. Remove this kludge if not
        # needed anymore. NOTE THAT THE PARENTHESES ON THE WEBSITE ARE NOT
        # DISPLAYED. They are put inside their own span elements that are
        # then hidden with some CSS.
        # https://en.wiktionary.org/wiki/%E1%83%90%E1%83%9B%E1%83%94%E1%83%A0%E1%83%98%E1%83%99%E1%83%98%E1%83%A1_%E1%83%A8%E1%83%94%E1%83%94%E1%83%A0%E1%83%97%E1%83%94%E1%83%91%E1%83%A3%E1%83%9A%E1%83%98_%E1%83%A8%E1%83%A2%E1%83%90%E1%83%A2%E1%83%94%E1%83%91%E1%83%98
        # ამერიკის შეერთებულ შტატებს(ა) (ameriḳis šeertebul šṭaṭebs(a))
        # The above should generate two alts entries, with two different
        # parallel versions, one without (a) and with (a) at the end,
        # for both the Georgian original and the romanization.
        elif (
            tablecontext.template_name == "ka-decl-noun"
            and len(alts) >= 1
            and any(" (" in alt_ for alt_ in alts)
        ):
            nalts = ka_decl_noun_template_cell(alts)
        else:
            # Default: expand parenthesized optional segments inside each
            # alternative, e.g. "kind(er)" -> "kind"/"kinder" and
            # "lampai(tten/den)" -> "lampaitten"/"lampaiden".
            new_alts = []
            for alt in alts:
                lst = [""]
                idx = 0
                for m in re.finditer(
                    r"(^|\w|\*)\((\w+" r"(/\w+)*)\)",
                    # start OR letter OR asterisk (word/word*)
                    # \\___________group 1_______/ \ \_g3_///
                    #  \                            \__gr. 2_//
                    #   \_____________group 0________________/
                    alt,
                ):
                    v = m.group(2)  # (word/word/word...)
                    if (
                        classify_desc(v) == "tags"  # Tags inside parens
                        or m.group(0) == alt
                    ):  # All in parens
                        continue
                    new_lst = []
                    for x in lst:
                        x += alt[idx : m.start()] + m.group(1)
                        # alt until letter or asterisk
                        idx = m.end()
                        vparts = v.split("/")
                        # group(2) = ["word", "wörd"...]
                        if len(vparts) == 1:
                            new_lst.append(x)
                            new_lst.append(x + v)
                            # "kind(er)" -> ["kind", "kinder"]
                        else:
                            for vv in vparts:
                                new_lst.append(x + vv)
                            # "lampai(tten/den)" ->
                            # ["lampaitten", "lampaiden"]
                    lst = new_lst
                for x in lst:
                    new_alts.append(x + alt[idx:])
                    # add the end of alt
            nalts = list((x, "", "") for x in new_alts)
            # [form, no romanization, no ipa]
        return nalts

1907 

1908 def find_semantic_parens(form: str) -> tuple[str, list[str]]: 

1909 # "Some languages" (=Greek) use brackets to mark things that 

1910 # require tags, like (informality), [rarity] and {archaicity}. 

1911 extra_tags = [] 

1912 if re.match(r"\([^][(){}]*\)$", form): 

1913 if get_lang_conf(lang, "parentheses_for_informal"): 

1914 form = form[1:-1] 

1915 extra_tags.append("informal") 

1916 else: 

1917 form = form[1:-1] 

1918 elif re.match(r"\{\[[^][(){}]*\]\}$", form): 

1919 if get_lang_conf( 1919 ↛ 1926line 1919 didn't jump to line 1926 because the condition on line 1919 was always true

1920 lang, "square_brackets_for_rare" 

1921 ) and get_lang_conf(lang, "curly_brackets_for_archaic"): 

1922 # είμαι/Greek/Verb 

1923 form = form[2:-2] 

1924 extra_tags.extend(["rare", "archaic"]) 

1925 else: 

1926 form = form[2:-2] 

1927 elif re.match(r"\{[^][(){}]*\}$", form): 

1928 if get_lang_conf(lang, "curly_brackets_for_archaic"): 1928 ↛ 1933line 1928 didn't jump to line 1933 because the condition on line 1928 was always true

1929 # είμαι/Greek/Verb 

1930 form = form[1:-1] 

1931 extra_tags.extend(["archaic"]) 

1932 else: 

1933 form = form[1:-1] 

1934 elif re.match(r"\[[^][(){}]*\]$", form): 

1935 if get_lang_conf(lang, "square_brackets_for_rare"): 1935 ↛ 1940line 1935 didn't jump to line 1940 because the condition on line 1935 was always true

1936 # είμαι/Greek/Verb 

1937 form = form[1:-1] 

1938 extra_tags.append("rare") 

1939 else: 

1940 form = form[1:-1] 

1941 return form, extra_tags 

1942 

    def handle_parens(
        form: str, roman: str, clitic: str, extra_tags: list[str]
    ) -> tuple[str, str, str]:
        """Interpret one parenthesized segment found in a form.

        Depending on the parenthesized content, it is treated as a
        clitic, a set of tags (appended to ``extra_tags`` in place), a
        romanization, or removed as a note. Returns the possibly
        modified ``(form, roman, clitic)``.

        NOTE(review): this relies on closure variables from the
        enclosing scope — ``paren`` (the parenthesized text), ``m`` (the
        regex match locating it in ``form``) and ``subst`` (the
        replacement text) — which must be set by the caller's loop
        before each call; not visible here, confirm against the caller.
        """
        if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren):
            # is there a clitic starting with apostrophe?
            clitic = paren
            # assume the whole paren is a clitic
            # then remove paren from form
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        elif classify_desc(paren) == "tags":
            tagsets1, topics1 = decode_tags(paren)
            if not topics1:
                for ts in tagsets1:
                    ts = tuple(x for x in ts if " " not in x)
                    # There are some generated tags containing
                    # spaces; do not let them through here.
                    extra_tags.extend(ts)
                form = (form[: m.start()] + subst + form[m.end() :]).strip()
        # brackets contain romanization
        elif (
            m.start() > 0
            and not roman
            and classify_desc(form[: m.start()]) == "other"
            and
            # "other" ~ text
            classify_desc(paren) in ("romanization", "english")
            and not re.search(r"^with |-form$", paren)
        ):
            roman = paren
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        elif re.search(r"^with |-form", paren):
            # descriptive note like "with ..." or "...-form": just drop it
            form = (form[: m.start()] + subst + form[m.end() :]).strip()
        return form, roman, clitic

1976 

    def merge_row_and_column_tags(
        form: str, some_has_covered_text: bool
    ) -> tuple[list[dict], str, bool]:
        """Combine row tags and column tags into form entries.

        Merge column tags and row tags. We give preference to moods etc
        coming from rowtags (cf. austteigen/German/Verb imperative
        forms). Produces one entry per (rowtags, coltags) combination.

        Returns ``(ret, form, some_has_covered_text)`` where ``ret`` is
        a list of form dicts. Uses many closure variables from the
        enclosing function: ``rowtags``, ``coltags``, ``global_tags``,
        ``extra_tags``, ``refs_tags``, ``tablecontext``, ``lang``,
        ``pos``, ``wxr``, ``col_idx``, ``has_covering_hdr``, ``roman``,
        ``ipa``, ``clitic`` and ``source``.
        """
        # In certain cases, what a tag means depends on whether
        # it is a row or column header. Depending on the language,
        # we replace certain tags with others if they're in
        # a column or row.

        ret = []
        for rt in sorted(rowtags):
            if "dummy-use-as-coltags" in rt:
                # This tagset is meant to be consumed as column tags only.
                continue
            for ct in sorted(coltags):
                if "dummy-use-as-rowtags" in ct:
                    # This tagset is meant to be consumed as row tags only.
                    continue
                tags = set(global_tags)
                tags.update(extra_tags)
                tags.update(rt)
                tags.update(refs_tags)
                tags.update(tablecontext.section_header)
                # Merge tags from column. For certain kinds of tags,
                # those coming from row take precedence.
                old_tags = set(tags)
                for t in ct:
                    c = valid_tags[t]
                    if c in ("mood", "case", "number") and any(
                        valid_tags[tt] == c for tt in old_tags
                    ):
                        continue
                    tags.add(t)

                # Extract language-specific tags from the
                # form. This may also adjust the form.
                form, lang_tags = lang_specific_tags(lang, pos, form)
                tags.update(lang_tags)

                # For non-finite verb forms, see if they have
                # a gender/class suffix
                if pos == "verb" and any(
                    valid_tags[t] == "non-finite" for t in tags
                ):
                    form, tt = parse_head_final_tags(wxr, lang, form)
                    tags.update(tt)

                # Remove "personal" tag if have nth person; these
                # come up with e.g. reconhecer/Portuguese/Verb. But
                # not if we also have "pronoun"
                if (
                    "personal" in tags
                    and "pronoun" not in tags
                    and any(
                        x in tags
                        for x in [
                            "first-person",
                            "second-person",
                            "third-person",
                        ]
                    )
                ):
                    tags.remove("personal")

                # If we have impersonal, remove person and number.
                # This happens with e.g. viajar/Portuguese/Verb
                if "impersonal" in tags:
                    tags = tags - set(
                        [
                            "first-person",
                            "second-person",
                            "third-person",
                            "singular",
                            "plural",
                        ]
                    )

                # Remove unnecessary "positive" tag from verb forms
                if pos == "verb" and "positive" in tags:
                    if "negative" in tags:
                        tags.remove("negative")
                    tags.remove("positive")

                # Many Russian (and other Slavic) inflection tables
                # have animate/inanimate distinction that generates
                # separate entries for neuter/feminine, but the
                # distinction only applies to masculine. Remove them
                # from neuter/feminine and eliminate duplicates.
                if get_lang_conf(lang, "masc_only_animate"):
                    for t1 in ("animate", "inanimate"):
                        for t2 in ("neuter", "feminine"):
                            if (
                                t1 in tags
                                and t2 in tags
                                and "masculine" not in tags
                                and "plural" not in tags
                            ):
                                tags.remove(t1)

                # German adjective tables contain "(keiner)" etc
                # for mixed declension plural. When the adjective
                # disappears and it becomes just one word, remove
                # the "includes-article" tag. e.g. eiskalt/German
                if "includes-article" in tags and " " not in form:
                    tags.remove("includes-article")

                # Handle ignored forms. We mark that the form was
                # provided. This is important information; some words
                # just do not have a certain form. However, there are also
                # many cases where no word in a language has a
                # particular form. Post-processing could detect and
                # remove such cases.
                if form in IGNORED_COLVALUES:
                    # cell text seems to be an "empty"/placeholder value
                    if "dummy-ignore-skipped" in tags:
                        continue
                    if (
                        col_idx not in has_covering_hdr
                        and some_has_covered_text
                    ):
                        continue
                    # don't ignore this cell if there's been a header
                    # above it
                    form = "-"
                elif col_idx in has_covering_hdr:
                    some_has_covered_text = True

                # Handle ambiguous object concord. If a header
                # gives the "dummy-object-concord"-tag to a word,
                # replace person, number and gender tags with
                # their "object-" counterparts so that the verb
                # agrees with the object instead.
                # Use only when the verb has ONLY object agreement!
                # a پخول/Pashto
                if "dummy-object-concord" in tags:
                    for subtag, objtag in object_concord_replacements.items():
                        if subtag in tags:
                            tags.remove(subtag)
                            tags.add(objtag)

                # Remove the dummy mood tag that we sometimes
                # use to block adding other mood and related
                # tags
                tags = tags - set(
                    [
                        "dummy-mood",
                        "dummy-tense",
                        "dummy-ignore-skipped",
                        "dummy-object-concord",
                        "dummy-reset-headers",
                        "dummy-use-as-coltags",
                        "dummy-use-as-rowtags",
                        "dummy-store-hdrspan",
                        "dummy-load-stored-hdrspans",
                        "dummy-reset-stored-hdrspans",
                        "dummy-section-header",
                    ]
                )

                # Perform language-specific tag replacements according
                # to rules in a table.
                lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings")
                if lang_tag_mappings is not None:
                    for pre, post in lang_tag_mappings.items():
                        if all(t in tags for t in pre):
                            tags = (tags - set(pre)) | set(post)

                # Warn if there are entries with empty tags
                if not tags:
                    wxr.wtp.debug(
                        "inflection table: empty tags for {}".format(form),
                        sortid="inflection/1826",
                    )

                # Warn if form looks like IPA
                ########## XXX ########
                # Because IPA is its own unicode block, we could also
                # technically do a Unicode name check to see if a string
                # contains IPA. Not all valid IPA characters are in the
                # IPA extension block, so you can technically have false
                # negatives if it's something like /toki/, but it
                # shouldn't give false positives.
                # Alternatively, you could make a list of IPA-admissible
                # characters and reject non-IPA stuff with that.
                if re.match(r"\s*/.*/\s*$", form):
                    wxr.wtp.debug(
                        "inflection table form looks like IPA: "
                        "form={} tags={}".format(form, tags),
                        sortid="inflection/1840",
                    )

                # Note that this checks `form`, not `in tags`
                if form == "dummy-ignored-text-cell":
                    continue

                if "dummy-remove-this-cell" in tags:
                    continue

                # Add the form
                tags = list(sorted(tags))
                dt = {"form": form, "tags": tags, "source": source}
                if roman:
                    dt["roman"] = roman
                if ipa:
                    dt["ipa"] = ipa
                ret.append(dt)
                # If we got separate clitic form, add it
                if clitic:
                    dt = {
                        "form": clitic,
                        "tags": tags + ["clitic"],
                        "source": source,
                    }
                    ret.append(dt)
        return ret, form, some_has_covered_text

2200 

2201 # First extract definitions from cells 

2202 # See defs_ht for footnote defs stuff 

2203 for row in rows: 

2204 for cell in row: 

2205 text, refs, defs, hdr_tags = extract_cell_content( 

2206 lang, word, cell.text 

2207 ) 

2208 # refs, defs = footnote stuff, defs -> (ref, def) 

2209 add_defs(defs) 

2210 # Extract definitions from text after table 

2211 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after) 

2212 add_defs(defs) 

2213 

2214 # Then extract the actual forms 

2215 ret = [] 

2216 hdrspans = [] 

2217 first_col_has_text = False 

2218 rownum = 0 

2219 title = None 

2220 global_tags = [] 

2221 table_tags = [] 

2222 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits") 

2223 form_replacements = get_lang_conf(lang, "form_replacements") 

2224 form_transformations = get_lang_conf(lang, "form_transformations") 

2225 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells") 

2226 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups") 

2227 

2228 for title in titles: 

2229 more_global_tags, more_table_tags, extra_forms = parse_title( 

2230 title, source 

2231 ) 

2232 global_tags.extend(more_global_tags) 

2233 table_tags.extend(more_table_tags) 

2234 ret.extend(extra_forms) 

2235 cell_rowcnt = collections.defaultdict(int) 

2236 seen_cells = set() 

2237 has_covering_hdr = set() 

2238 some_has_covered_text = False 

2239 for row in rows: 

2240 # print("ROW:", row) 

2241 # print("====") 

2242 # print(f"Start of PREVIOUS row hdrspans:" 

2243 # f"{tuple(sp.tagsets for sp in hdrspans)}") 

2244 # print(f"Start of row txt: {tuple(t.text for t in row)}") 

2245 if not row: 2245 ↛ 2246line 2245 didn't jump to line 2246 because the condition on line 2245 was never true

2246 continue # Skip empty rows 

2247 all_headers = all(x.is_title or not x.text.strip() for x in row) 

2248 text = row[0].text 

2249 if ( 

2250 row[0].is_title 

2251 and text 

2252 and not is_superscript(text[0]) 

2253 and text not in infl_map # zealous inflation map? 

2254 and ( 

2255 re.match(r"Inflection ", text) 

2256 or re.sub( 

2257 r"\s+", 

2258 " ", # flatten whitespace 

2259 re.sub( 

2260 r"\s*\([^)]*\)", 

2261 "", 

2262 # Remove whitespace+parens 

2263 text, 

2264 ), 

2265 ).strip() 

2266 not in infl_map 

2267 ) 

2268 and not re.match(infl_start_re, text) 

2269 and all( 

2270 x.is_title == row[0].is_title and x.text == text 

2271 # all InflCells in `row` have the same is_title and text 

2272 for x in row 

2273 ) 

2274 ): 

2275 if text and title is None: 

2276 # Only if there were no titles previously make the first 

2277 # text that is found the title 

2278 title = text 

2279 if re.match(r"(Note:|Notes:)", title): 2279 ↛ 2280line 2279 didn't jump to line 2280 because the condition on line 2279 was never true

2280 continue # not a title 

2281 more_global_tags, more_table_tags, extra_forms = parse_title( 

2282 title, source 

2283 ) 

2284 global_tags.extend(more_global_tags) 

2285 table_tags.extend(more_table_tags) 

2286 ret.extend(extra_forms) 

2287 continue # Skip title rows without incrementing i 

2288 if "dummy-skip-this" in global_tags: 2288 ↛ 2289line 2288 didn't jump to line 2289 because the condition on line 2288 was never true

2289 return [] 

2290 rowtags = [()] 

2291 # have_hdr = False 

2292 # have_hdr never used? 

2293 have_text = False 

2294 samecell_cnt = 0 

2295 col0_hdrspan = None # col0 or later header (despite its name) 

2296 col0_followed_by_nonempty = False 

2297 row_empty = True 

2298 for col_idx, cell in enumerate(row): 

2299 colspan = cell.colspan # >= 1 

2300 rowspan = cell.rowspan # >= 1 

2301 previously_seen = id(cell) in seen_cells 

2302 # checks to see if this cell was in the previous ROW 

2303 seen_cells.add(id(cell)) 

2304 if samecell_cnt == 0: 

2305 # First column of a (possible multi-column) cell 

2306 samecell_cnt = colspan - 1 

2307 else: 

2308 assert samecell_cnt > 0 

2309 samecell_cnt -= 1 

2310 continue 

2311 

2312 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0 

2313 # never used? 

2314 

2315 # defaultdict(int) around line 1900 

2316 cell_rowcnt[id(cell)] += 1 

2317 # => how many cols this spans 

2318 col = cell.text 

2319 if not col: 

2320 continue 

2321 row_empty = False 

2322 is_title = cell.is_title 

2323 

2324 # If the cell has a target, i.e., text after colon, interpret 

2325 # it as simply specifying a value for that value and ignore 

2326 # it otherwise. 

2327 if cell.target: 

2328 text, refs, defs, hdr_tags = extract_cell_content( 

2329 lang, word, col 

2330 ) 

2331 if not text: 2331 ↛ 2332line 2331 didn't jump to line 2332 because the condition on line 2331 was never true

2332 continue 

2333 refs_tags = set() 

2334 for ref in refs: # gets tags from footnotes 2334 ↛ 2335line 2334 didn't jump to line 2335 because the loop on line 2334 never started

2335 if ref in def_ht: 

2336 refs_tags.update(def_ht[ref]) 

2337 rowtags = expand_header( 

2338 wxr, 

2339 tablecontext, 

2340 word, 

2341 lang, 

2342 pos, 

2343 text, 

2344 [], 

2345 silent=True, 

2346 depth=depth, 

2347 column_number=col_idx, 

2348 ) 

2349 rowtags = list( 

2350 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags) 

2351 ) 

2352 is_title = False 

2353 col = cell.target 

2354 

2355 # print(rownum, col_idx, col) 

2356 # print(f"is_title: {is_title}") 

2357 if is_title: 

2358 # It is a header cell 

2359 text, refs, defs, hdr_tags = extract_cell_content( 

2360 lang, word, col 

2361 ) 

2362 if not text: 

2363 continue 

2364 # Extract tags from referenced footnotes 

2365 refs_tags = set() 

2366 for ref in refs: 

2367 if ref in def_ht: 

2368 refs_tags.update(def_ht[ref]) 

2369 

2370 # Expand header to tags 

2371 v = expand_header( 

2372 wxr, 

2373 tablecontext, 

2374 word, 

2375 lang, 

2376 pos, 

2377 text, 

2378 [], 

2379 silent=True, 

2380 depth=depth, 

2381 column_number=col_idx, 

2382 ) 

2383 # print("EXPANDED {!r} to {}".format(text, v)) 

2384 

2385 if col_idx == 0: 

2386 # first_col_has_text is used for a test to ignore 

2387 # upper-left cells that are just text without 

2388 # header info 

2389 first_col_has_text = True 

2390 # Check if the header expands to reset hdrspans 

2391 if any("dummy-reset-headers" in tt for tt in v): 

2392 new_hdrspans = [] 

2393 for hdrspan in hdrspans: 

2394 # if there are HdrSpan objects (abstract headers with 

2395 # row- and column-spans) that are to the left or at the 

2396 # same row or below, KEEP those; things above and to 

2397 # the right of the hdrspan with dummy-reset-headers 

2398 # are discarded. Tags from the header together with 

2399 # dummy-reset-headers are kept as normal. 

2400 if ( 

2401 hdrspan.start + hdrspan.colspan < col_idx 

2402 or hdrspan.rownum > rownum - cell.rowspan 

2403 ): 

2404 new_hdrspans.append(hdrspan) 

2405 hdrspans = new_hdrspans 

2406 

2407 for tt in v: 

2408 if "dummy-section-header" in tt: 2408 ↛ 2409line 2408 didn't jump to line 2409 because the condition on line 2408 was never true

2409 tablecontext.section_header = tt 

2410 break 

2411 if "dummy-reset-section-header" in tt: 2411 ↛ 2412line 2411 didn't jump to line 2412 because the condition on line 2411 was never true

2412 tablecontext.section_header = [] 

2413 # Text between headers on a row causes earlier headers to 

2414 # be reset 

2415 if have_text: 

2416 # print(" HAVE_TEXT BEFORE HDR:", col) 

2417 # Reset rowtags if new title column after previous 

2418 # text cells 

2419 # +-----+-----+-----+-----+ 

2420 # |hdr-a|txt-a|hdr-B|txt-B| 

2421 # +-----+-----+-----+-----+ 

2422 # ^reset rowtags=> 

2423 # XXX beware of header "—": "" - must not clear on that if 

2424 # it expands to no tags 

2425 rowtags = [()] 

2426 # have_hdr = True 

2427 # have_hdr never used? 

2428 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags)) 

2429 # Update rowtags and coltags 

2430 has_covering_hdr.add(col_idx) # col_idx == current column 

2431 # has_covering_hdr is a set that has the col_idx-ids of columns 

2432 # that have previously had some kind of header. It is never 

2433 # resetted inside the col_idx-loops OR the bigger rows-loop, so 

2434 # applies to the whole table. 

2435 

2436 rowtags, new_coltags, all_hdr_tags = generate_tags( 

2437 rowtags, table_tags 

2438 ) 

2439 

2440 if any("dummy-skip-this" in ts for ts in rowtags): 

2441 continue # Skip this cell 

2442 

2443 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2443 ↛ 2444line 2443 didn't jump to line 2444 because the condition on line 2443 was never true

2444 hdrspans.extend(tablecontext.stored_hdrspans) 

2445 

2446 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2446 ↛ 2447line 2446 didn't jump to line 2447 because the condition on line 2446 was never true

2447 tablecontext.stored_hdrspans = [] 

2448 

2449 if any("dummy-store-hdrspan" in ts for ts in v): 2449 ↛ 2451line 2449 didn't jump to line 2451 because the condition on line 2449 was never true

2450 # print(f"STORED: {col}") 

2451 store_new_hdrspan = True 

2452 else: 

2453 store_new_hdrspan = False 

2454 

2455 new_coltags = list( 

2456 x 

2457 for x in new_coltags 

2458 if not any(t in noinherit_tags for t in x) 

2459 ) 

2460 # print("new_coltags={} previously_seen={} all_hdr_tags={}" 

2461 # .format(new_coltags, previously_seen, all_hdr_tags)) 

2462 if any(new_coltags): 

2463 ( 

2464 col, 

2465 col0_followed_by_nonempty, 

2466 col0_hdrspan, 

2467 ) = add_new_hdrspan( 

2468 col, 

2469 hdrspans, 

2470 store_new_hdrspan, 

2471 col0_followed_by_nonempty, 

2472 col0_hdrspan, 

2473 ) 

2474 

2475 continue 

2476 

2477 # These values are ignored, at least for now 

2478 if re.match(r"^(# |\(see )", col): 2478 ↛ 2479line 2478 didn't jump to line 2479 because the condition on line 2478 was never true

2479 continue 

2480 

2481 if any("dummy-skip-this" in ts for ts in rowtags): 

2482 continue # Skip this cell 

2483 

2484 # If the word has no rowtags and is a multi-row cell, then 

2485 # ignore this. This happens with empty separator rows 

2486 # within a rowspan>1 cell. cf. wander/English/Conjugation. 

2487 if rowtags == [()] and rowspan > 1: 

2488 continue 

2489 

2490 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle. 

2491 if cleanup_rules: 

2492 for regx, substitution in cleanup_rules.items(): 

2493 col = re.sub(regx, substitution, col) 

2494 

2495 if ( 2495 ↛ 2500line 2495 didn't jump to line 2500 because the condition on line 2495 was never true

2496 col_idx == 0 

2497 and not first_col_has_text 

2498 and get_lang_conf(lang, "ignore_top_left_text_cell") is True 

2499 ): 

2500 continue # Skip text at top left, as in Icelandic, Faroese 

2501 

2502 # if col0_hdrspan is not None: 

2503 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}" 

2504 # .format(col0_hdrspan.text, col)) 

2505 col0_followed_by_nonempty = True 

2506 have_text = True 

2507 

2508 # Determine column tags for the multi-column cell 

2509 combined_coltags = compute_coltags( 

2510 lang, pos, hdrspans, col_idx, colspan, col 

2511 ) 

2512 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2512 ↛ 2513line 2512 didn't jump to line 2513 because the condition on line 2512 was never true

2513 continue 

2514 

2515 # Split the text into separate forms. First simplify spaces except 

2516 # newline. 

2517 col = re.sub(r"[ \t\r]+", " ", col) 

2518 # Split the cell text into alternatives 

2519 

2520 col, alts, split_extra_tags = split_text_into_alts(col) 

2521 

2522 # Some cells have mixed form content, like text and romanization, 

2523 # or text and IPA. Handle these. 

2524 alts = handle_mixed_lines(alts) 

2525 

2526 alts = list((x, combined_coltags) for x in alts) 

2527 

2528 # Generate forms from the alternatives 

2529 # alts is a list of (tuple of forms, tuple of tags) 

2530 for (form, base_roman, ipa), coltags in alts: 

2531 form = form.strip() 

2532 extra_tags = [] 

2533 extra_tags.extend(split_extra_tags) 

2534 # Handle special splits again here, so that we can have custom 

2535 # mappings from form to form and tags. 

2536 if form in form_replacements: 

2537 replacement, tags = form_replacements[form] 

2538 for x in tags.split(): 

2539 assert x in valid_tags 

2540 assert isinstance(replacement, str) 

2541 assert isinstance(tags, str) 

2542 form = replacement 

2543 extra_tags.extend(tags.split()) 

2544 

2545 check_romanization_form_transformation = False 

2546 # loop over regexes in form_transformation and replace text 

2547 # in form using regex patterns 

2548 # this does a bit of the same stuff the above does, 

2549 # but with regexes and re.sub() instead 

2550 for ( 

2551 form_transformations_pos, 

2552 v, 

2553 subst, 

2554 tags, 

2555 ) in form_transformations: 

2556 # v is a pattern string, like "^ich" 

2557 if ( 

2558 isinstance(form_transformations_pos, str) 

2559 and pos != form_transformations_pos 

2560 ) or ( 

2561 (not isinstance(form_transformations_pos, str)) 

2562 and pos not in form_transformations_pos 

2563 ): 

2564 continue 

2565 m = re.search(v, form) 

2566 if m is not None: 

2567 if base_roman: 2567 ↛ 2568line 2567 didn't jump to line 2568 because the condition on line 2567 was never true

2568 for _, rom_v, rom_sub, _ in form_transformations: 

2569 rom_m = re.search(rom_v, base_roman) 

2570 if rom_m is not None: 

2571 base_roman = re.sub( 

2572 rom_v, rom_sub, base_roman 

2573 ) 

2574 break 

2575 form = re.sub(v, subst, form) 

2576 for x in tags.split(): 

2577 assert x in valid_tags 

2578 extra_tags.extend(tags.split()) 

2579 check_romanization_form_transformation = True 

2580 break 

2581 

2582 # Clean the value, extracting reference symbols 

2583 form, refs, defs, hdr_tags = extract_cell_content( 

2584 lang, word, form 

2585 ) 

2586 # if refs: 

2587 # print("REFS:", refs) 

2588 extra_tags.extend(hdr_tags) 

2589 # Extract tags from referenced footnotes 

2590 refs_tags = set() 

2591 for ref in refs: 

2592 if ref in def_ht: 

2593 refs_tags.update(def_ht[ref]) 

2594 

2595 if base_roman: 

2596 if check_romanization_form_transformation: 2596 ↛ 2600line 2596 didn't jump to line 2600 because the condition on line 2596 was never true

2597 # because form_transformations are used to handle things 

2598 # where the romanization has the "same" structure, we 

2599 # need to handle that here too.... 

2600 for ( 

2601 _, 

2602 v, 

2603 subst, 

2604 _, 

2605 ) in form_transformations: 

2606 # v is a pattern string, like "^ich" 

2607 m = re.search(v, base_roman) 

2608 if m is not None: 

2609 base_roman = re.sub(v, subst, base_roman) 

2610 # XXX add tag stuff here if needed 

2611 break 

2612 

2613 base_roman, _, _, hdr_tags = extract_cell_content( 

2614 lang, word, base_roman 

2615 ) 

2616 extra_tags.extend(hdr_tags) 

2617 

2618 # Do some additional cleanup on the cell. 

2619 form = re.sub(r"^\s*,\s*", "", form) 

2620 form = re.sub(r"\s*,\s*$", "", form) 

2621 form = re.sub(r"\s*(,\s*)+", ", ", form) 

2622 form = re.sub(r"(?i)^Main:", "", form) 

2623 form = re.sub(r"\s+", " ", form) 

2624 form = form.strip() 

2625 

2626 # Look for parentheses that have semantic meaning 

2627 form, et = find_semantic_parens(form) 

2628 extra_tags.extend(et) 

2629 

2630 # Handle parentheses in the table element. We parse 

2631 # tags anywhere and romanizations anywhere but beginning. 

2632 roman = base_roman 

2633 paren = None 

2634 clitic = None 

2635 m = re.search(r"(\s+|^)\(([^)]*)\)", form) 

2636 # start|spaces + (anything) 

2637 if m is not None: 

2638 subst = m.group(1) 

2639 paren = m.group(2) 

2640 else: 

2641 m = re.search(r"\(([^)]*)\)(\s+|$)", form) 

2642 # (anything) + spaces|end 

2643 if m is not None: 2643 ↛ 2644line 2643 didn't jump to line 2644 because the condition on line 2643 was never true

2644 paren = m.group(1) 

2645 subst = m.group(2) 

2646 if paren is not None: 

2647 form, roman, clitic = handle_parens( 

2648 form, roman, clitic, extra_tags 

2649 ) 

2650 

2651 # Ignore certain forms that are not really forms, 

2652 # unless they're really, really close to the article title 

2653 if form in ( 2653 ↛ 2658line 2653 didn't jump to line 2658 because the condition on line 2653 was never true

2654 "", 

2655 "unchanged", 

2656 "after an", # in sona/Irish/Adj/Mutation 

2657 ): 

2658 Lev = distw([form], word) 

2659 if form and Lev < 0.1: 

2660 wxr.wtp.debug( 

2661 "accepted possible false positive '{}' with" 

2662 "> 0.1 Levenshtein distance in {}/{}".format( 

2663 form, word, lang 

2664 ), 

2665 sortid="inflection/2213", 

2666 ) 

2667 elif form and Lev < 0.3: 

2668 wxr.wtp.debug( 

2669 "skipped possible match '{}' with > 0.3" 

2670 "Levenshtein distance in {}/{}".format( 

2671 form, word, lang 

2672 ), 

2673 sortid="inflection/2218", 

2674 ) 

2675 continue 

2676 else: 

2677 continue 

2678 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} " 

2679 # "FORM={!r} ROMAN={!r}" 

2680 # .format(rowtags, coltags, refs_tags, 

2681 # form, roman)) 

2682 

2683 # Merge tags from row and column and do miscellaneous 

2684 # tag-related handling. 

2685 ( 

2686 merge_ret, 

2687 form, 

2688 some_has_covered_text, 

2689 ) = merge_row_and_column_tags(form, some_has_covered_text) 

2690 ret.extend(merge_ret) 

2691 

2692 # End of row. 

2693 rownum += 1 

2694 # For certain languages, if the row was empty, reset 

2695 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb). 

2696 if row_empty and get_lang_conf(lang, "empty_row_resets"): 

2697 hdrspans = [] 

2698 # Check if we should expand col0_hdrspan. 

2699 if col0_hdrspan is not None: 

2700 col0_allowed = get_lang_conf(lang, "hdr_expand_first") 

2701 col0_cats = tagset_cats(col0_hdrspan.tagsets) 

2702 # Only expand if col0_cats and later_cats are allowed 

2703 # and don't overlap and col0 has tags, and there have 

2704 # been no disallowed cells in between. 

2705 if ( 

2706 not col0_followed_by_nonempty 

2707 and not (col0_cats - col0_allowed) 

2708 and 

2709 # len(col0_cats) == 1 and 

2710 col_idx > col0_hdrspan.start + col0_hdrspan.colspan 

2711 ): 

2712 # If an earlier header is only followed by headers that yield 

2713 # no tags, expand it to entire row 

2714 # print("EXPANDING COL0: {} from {} to {} cols {}" 

2715 # .format(col0_hdrspan.text, col0_hdrspan.colspan, 

2716 # len(row) - col0_hdrspan.start, 

2717 # col0_hdrspan.tagsets)) 

2718 col0_hdrspan.colspan = len(row) - col0_hdrspan.start 

2719 col0_hdrspan.expanded = True 

2720 # XXX handle refs and defs 

2721 # for x in hdrspans: 

2722 # print(" HDRSPAN {} {} {} {!r}" 

2723 # .format(x.start, x.colspan, x.tagsets, x.text)) 

2724 

2725 # Post-process German nouns with articles in separate columns. We move the 

2726 # definite/indefinite/usually-without-article markers into the noun and 

2727 # remove the article entries. 

2728 if get_lang_conf(lang, "articles_in_separate_columns") and any( 

2729 "noun" in x["tags"] for x in ret 

2730 ): 

2731 new_ret = [] 

2732 saved_tags = set() 

2733 had_noun = False 

2734 for dt in ret: 

2735 tags = dt["tags"] 

2736 # print(tags) 

2737 if "noun" in tags: 

2738 tags = list( 

2739 sorted(set(t for t in tags if t != "noun") | saved_tags) 

2740 ) 

2741 had_noun = True 

2742 elif ( 2742 ↛ 2769line 2742 didn't jump to line 2769 because the condition on line 2742 was always true

2743 "indefinite" in tags 

2744 or "definite" in tags 

2745 or "usually-without-article" in tags 

2746 or "without-article" in tags 

2747 ): 

2748 if had_noun: 

2749 saved_tags = set(tags) 

2750 else: 

2751 saved_tags = saved_tags | set(tags) # E.g. Haus/German 

2752 remove_useless_tags(lang, pos, saved_tags) 

2753 saved_tags = saved_tags & set( 

2754 [ 

2755 "masculine", 

2756 "feminine", 

2757 "neuter", 

2758 "singular", 

2759 "plural", 

2760 "indefinite", 

2761 "definite", 

2762 "usually-without-article", 

2763 "without-article", 

2764 ] 

2765 ) 

2766 had_noun = False 

2767 continue # Skip the articles 

2768 

2769 dt = dt.copy() 

2770 dt["tags"] = tags 

2771 new_ret.append(dt) 

2772 ret = new_ret 

2773 

2774 elif possibly_ignored_forms: 

2775 # Some languages have tables with cells that are kind of separated 

2776 # and difficult to handle, like eulersche Formel/German where 

2777 # the definite and indefinite articles are just floating. 

2778 # If a language has a dict of conditionally_ignored_cells, 

2779 # and if the contents of a cell is found in one of the rules 

2780 # there, ignore that cell if it 

2781 # 1. Does not have the appropriate tag (like "definite" for "die") 

2782 # and 

2783 # 2. The title of the article is not one of the other co-words 

2784 # (ie. it's an article for the definite articles in german etc.) 

2785 # pass 

2786 new_ret = [] 

2787 for cell_data in ret: 

2788 tags = cell_data["tags"] 

2789 text = cell_data["form"] 

2790 skip_this = False 

2791 for key_tag, ignored_forms in possibly_ignored_forms.items(): 

2792 if text not in ignored_forms: 2792 ↛ 2794line 2792 didn't jump to line 2794 because the condition on line 2792 was always true

2793 continue 

2794 if word in ignored_forms: 

2795 continue 

2796 if key_tag not in tags: 

2797 skip_this = True 

2798 

2799 if skip_this: 2799 ↛ 2800line 2799 didn't jump to line 2800 because the condition on line 2799 was never true

2800 continue 

2801 new_ret.append(cell_data) 

2802 

2803 ret = new_ret 

2804 

2805 # Post-process English inflection tables, addding "multiword-construction" 

2806 # when the number of words has increased. 

2807 if lang == "English" and pos == "verb": 

2808 word_words = len(word.split()) 

2809 new_ret = [] 

2810 for dt in ret: 

2811 form = dt.get("form", "") 

2812 if len(form.split()) > word_words: 

2813 dt = dt.copy() 

2814 dt["tags"] = list(dt.get("tags", [])) 

2815 # This strange copy-assigning shuffle is preventative black 

2816 # magic; do not touch lest you invoke deep bugs. 

2817 data_append(dt, "tags", "multiword-construction") 

2818 new_ret.append(dt) 

2819 ret = new_ret 

2820 

2821 # Always insert "table-tags" detail as the first entry in any inflection 

2822 # table. This way we can reliably detect where a new table starts. 

2823 # Table-tags applies until the next table-tags entry. 

2824 if ret or table_tags: 

2825 table_tags = list(sorted(set(table_tags))) 

2826 dt = { 

2827 "form": " ".join(table_tags), 

2828 "source": source, 

2829 "tags": ["table-tags"], 

2830 } 

2831 if dt["form"] == "": 

2832 dt["form"] = "no-table-tags" 

2833 if tablecontext.template_name: 

2834 tn = { 

2835 "form": tablecontext.template_name, 

2836 "source": source, 

2837 "tags": ["inflection-template"], 

2838 } 

2839 ret = [dt] + [tn] + ret 

2840 else: 

2841 ret = [dt] + ret 

2842 

2843 return ret 

2844 

2845 

def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Parse one inflection table (rows of InflCell) into form entries and
    append them to ``data["forms"]``.

    Duplicate entries are dropped, as are "dated" variants whose identical
    counterpart without the "dated" tag was already added (this happens with
    e.g. Russian modern vs. pre-reform declension tables that partially
    duplicate each other).
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        assert isinstance(row, list)
        for cell in row:
            assert isinstance(cell, InflCell)
    assert isinstance(titles, list)
    for title in titles:
        assert isinstance(title, str)

    # Attempt to interpret the table with the "simple table" strategy;
    # it is currently the only implemented strategy.
    parsed = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if parsed is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms, eliminating exact duplicates.
    seen_forms = set()
    for dt in parsed:
        frozen = freeze(dt)
        if frozen in seen_forms:
            continue  # Don't add duplicates

        tags = dt.get("tags", [])

        # Skip a "dated" variant when the same entry minus the "dated" tag
        # has already been added from the modern declension table.
        redundant_dated = False
        if "dated" in tags:
            undated_tags = [t for t in tags if t != "dated"]
            if undated_tags:
                undated_dt = dt.copy()
                undated_dt["tags"] = undated_tags
                redundant_dated = freeze(undated_dt) in seen_forms
        if redundant_dated:
            continue  # Already have it without the archaic variant tag

        # "table-tags" marker entries are never counted as duplicates;
        # every table gets its own marker.
        if "table-tags" not in tags:
            seen_forms.add(frozen)
        data_append(data, "forms", dt)

2901 

2902 

def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Decide whether a table cell should be treated as a header.

    Combines several heuristics: explicit header cells (``th`` /
    TABLE_HEADER_CELL), "Key: value" titles whose key is in infl_map,
    cells whose expanded text looks like valid header tags (checked
    against LANGUAGES_WITH_CELLS_AS_HEADERS), style matching with the
    first column, whole-column header markers, and "Conjugation of ..."
    style captions.

    Returns a tuple (is_title, hdr_expansion, target, celltext), where
    ``target`` and ``celltext`` may have been rewritten when the cell
    had a "header: target" structure.
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    # Header cells are spelled differently in parsed wikitext tables
    # vs. raw HTML tables.
    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    # Position of a "header: target" separator within the cell, if any.
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    # Expand the cleaned text through the header-tag tables; errors in the
    # expansion mean the text is not a plausible header.
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # A data cell whose text *could* be a header: accept it only for
        # languages/cell texts whitelisted in LANGUAGES_WITH_CELLS_AS_HEADERS,
        # and emit debug messages describing the decision.
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    if idx >= 0 and titletext[:idx] in infl_map:
        # "header: target" cell; split it so the header part becomes the
        # cell text and the remainder becomes the target word.
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        # Explicitly marked header cell (th / TABLE_HEADER_CELL), unless it
        # contains an Azerbaijani-language span (those are word forms).
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        # Heuristic candidate accepted above, and the text is not too close
        # to the page title (which would suggest an inflected form instead).
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        # Style-based heuristic: the cell shares the composite style string
        # of the row's first cell; again gated by the per-language whitelist.
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext

3095 

3096 

class TableContext:
    """Saved context used when parsing a table and its subtables.

    Attributes:
        stored_hdrspans: header spans carried over between related tables.
        section_header: header data for the current table section.
        template_name: name of the template the table was expanded from,
            or "" when unknown.
    """

    # Fix: the original spelled this ``__slot__`` (missing the trailing
    # "s"), which Python ignores entirely, so the slots never took effect
    # and instances silently got a ``__dict__``. With the correct name,
    # only the attributes listed here may be set on instances.
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name=None):
        self.stored_hdrspans = []
        self.section_header = []
        # Normalize falsy template names (None, "") to the empty string.
        self.template_name = template_name or ""

3114 

3115def handle_wikitext_or_html_table( 

3116 wxr: WiktextractContext, 

3117 word: str, 

3118 lang: str, 

3119 pos: str, 

3120 data, 

3121 tree, 

3122 titles, 

3123 source, 

3124 after, 

3125 tablecontext: TableContext | None = None, 

3126): 

3127 """Parses a table from parsed Wikitext format into rows and columns of 

3128 InflCell objects and then calls handle_generic_table() to parse it into 

3129 forms. This adds the forms into ``data``.""" 

3130 assert isinstance(wxr, WiktextractContext) 

3131 assert isinstance(word, str) 

3132 assert isinstance(lang, str) 

3133 assert isinstance(pos, str) 

3134 assert isinstance(data, dict) 

3135 assert isinstance(tree, WikiNode) 

3136 assert tree.kind == NodeKind.TABLE or ( 

3137 tree.kind == NodeKind.HTML and tree.sarg == "table" 

3138 ) 

3139 assert isinstance(titles, list) 

3140 assert isinstance(source, str) 

3141 for x in titles: 

3142 assert isinstance(x, str) 

3143 assert isinstance(after, str) 

3144 assert tablecontext is None or isinstance(tablecontext, TableContext) 

3145 # Imported here to avoid a circular import 

3146 from wiktextract.page import clean_node, recursively_extract 

3147 

3148 # from wikitextprocessor.parser import print_tree 

3149 # print_tree(tree) 

3150 # print("-------==========-------") 

3151 

3152 if not tablecontext: 

3153 tablecontext = TableContext() 

3154 

3155 # Get language specific text removal patterns 

3156 remove_text_patterns: ( 

3157 dict[tuple[str, ...], tuple[str | re.Pattern, ...]] | None 

3158 ) = None 

3159 if rem := get_lang_conf(lang, "remove_text_patterns"): 

3160 for poses in rem.keys(): 

3161 if pos in poses: 

3162 remove_text_patterns = rem[poses] 

3163 break 

3164 

3165 def handle_table1( 

3166 wxr, 

3167 tablecontext, 

3168 word, 

3169 lang, 

3170 pos, 

3171 data, 

3172 tree, 

3173 titles, 

3174 source, 

3175 after, 

3176 depth, 

3177 ): 

3178 """Helper function allowing the 'flattening' out of the table 

3179 recursion: instead of handling the tables in the wrong order 

3180 (recursively), this function adds to new_row that is then 

3181 iterated through in the main function at the end, creating 

3182 a longer table (still in pieces) in the correct order.""" 

3183 

3184 assert isinstance(data, dict) 

3185 assert isinstance(titles, list) 

3186 assert isinstance(source, str) 

3187 for x in titles: 

3188 assert isinstance(x, str) 

3189 assert isinstance(after, str) 

3190 assert isinstance(depth, int) 

3191 # print("HANDLE_WIKITEXT_TABLE", titles) 

3192 

3193 col_gap_data = [] # Filling for columns with rowspan > 1 

3194 # col_gap_data contains None or InflCell 

3195 vertical_still_left = [] # Number of remaining rows for which to fill 

3196 # the column; vertical_still_left contains int 

3197 cols_headered = [] # [F, T, F, F...] 

3198 # True when the whole column contains headers, even 

3199 # when the cell is not considered a header; triggered 

3200 # by the "*" inflmap meta-tag. 

3201 rows = [] 

3202 

3203 sub_ret = [] 

3204 

3205 # from wikitextprocessor.parser import print_tree 

3206 # print_tree(tree) 

3207 for node in tree.children: 

3208 if not isinstance(node, WikiNode): 

3209 continue 

3210 if node.kind == NodeKind.HTML: 

3211 kind = node.sarg 

3212 else: 

3213 kind = node.kind 

3214 

3215 # print(" {}".format(node)) 

3216 if kind in (NodeKind.TABLE_CAPTION, "caption"): 

3217 # print(" CAPTION:", node) 

3218 pass 

3219 elif kind in (NodeKind.TABLE_ROW, "tr"): 

3220 if "vsShow" in node.attrs.get("class", "").split(): 

3221 # vsShow rows are those that are intially shown in tables 

3222 # that have more data. The hidden data duplicates these 

3223 # rows, so we skip it and just process the hidden data. 

3224 continue 

3225 

3226 # if ( 

3227 # len(node.children) == 1 

3228 # and node.children[0].attrs.get("class") == "separator" 

3229 # ): 

3230 # print("------------------ skip separator") 

3231 # continue 

3232 

3233 # Parse a table row. 

3234 row = [] 

3235 style = None 

3236 row_has_nonempty_cells = False 

3237 # Have nonempty cell not from rowspan 

3238 for col in get_table_cells(node): 

3239 # loop through each cell in the ROW 

3240 

3241 # The below skip is not needed anymore, because we "skip" in 

3242 # get_table_cells, but left here as a comment 

3243 # if not isinstance(col, WikiNode): 

3244 # # This skip is not used for counting, 

3245 # # "None" is not used in 

3246 # # indexing or counting or looping. 

3247 # continue 

3248 if col.kind == NodeKind.HTML: 

3249 kind = col.sarg 

3250 else: 

3251 kind = col.kind 

3252 if kind not in ( 3252 ↛ 3258line 3252 didn't jump to line 3258 because the condition on line 3252 was never true

3253 NodeKind.TABLE_HEADER_CELL, 

3254 NodeKind.TABLE_CELL, 

3255 "th", 

3256 "td", 

3257 ): 

3258 print(" UNEXPECTED ROW CONTENT: {}".format(col)) 

3259 continue 

3260 

3261 while ( 

3262 len(row) < len(vertical_still_left) 

3263 and vertical_still_left[len(row)] > 0 

3264 ): 

3265 # vertical_still_left is [...0, 0, 2...] for each 

3266 # column. It is populated at the end of the loop, at the 

3267 # same time as col_gap_data. This needs to be looped and 

3268 # filled this way because each `for col`-looping jumps 

3269 # straight to the next meaningful cell; there is no 

3270 # "None" cells, only emptiness between, and rowspan and 

3271 # colspan are just to generate the "fill- 

3272 vertical_still_left[len(row)] -= 1 

3273 row.append(col_gap_data[len(row)]) 

3274 

3275 # appending row is how "indexing" is 

3276 # done here; something is appended, 

3277 # like a filler-cell here or a "start" 

3278 # cell at the end of the row-loop, 

3279 # which increased len(row) which is 

3280 # then used as the target-index to check 

3281 # for gaps. vertical_still_left is 

3282 # the countdown to when to stop 

3283 # filling in gaps, and goes down to 0, 

3284 # and col_gap_data is not touched 

3285 # except when a new rowspan is needed, 

3286 # at the same time that 

3287 # vertical_still_left gets reassigned. 

3288 

3289 try: 

3290 rowspan = int(col.attrs.get("rowspan", "1")) # 🡙 

3291 colspan = int(col.attrs.get("colspan", "1")) # 🡘 

3292 except ValueError: 

3293 rowspan = 1 

3294 colspan = 1 

3295 # print("COL:", col) 

3296 

3297 # Too many of these errors 

3298 if colspan > 100: 

3299 # wxr.wtp.error( 

3300 # f"Colspan {colspan} over 30, set to 1", 

3301 # sortid="inflection/20250113a", 

3302 # ) 

3303 colspan = 100 

3304 if rowspan > 100: 3304 ↛ 3309line 3304 didn't jump to line 3309 because the condition on line 3304 was never true

3305 # wxr.wtp.error( 

3306 # f"Rowspan {rowspan} over 30, set to 1", 

3307 # sortid="inflection/20250113b", 

3308 # ) 

3309 rowspan = 100 

3310 

3311 # Process any nested tables recursively. 

3312 tables, rest = recursively_extract( 

3313 col, 

3314 lambda x: isinstance(x, WikiNode) 

3315 and (x.kind == NodeKind.TABLE or x.sarg == "table"), 

3316 ) 

3317 

3318 # Clean the rest of the cell. 

3319 celltext = clean_node(wxr, None, rest) 

3320 # print(f"CLEANED: {celltext=}") 

3321 # print(f"SUBTABLES: {tables}") 

3322 

3323 # Remove regexed patterns from text 

3324 if remove_text_patterns is not None: 

3325 for pat in remove_text_patterns: 

3326 celltext = re.sub(pat, "", celltext) 

3327 # print(f"AFTER: {celltext=} <<") 

3328 

3329 # Handle nested tables. 

3330 for tbl in tables: 

3331 # Some nested tables (e.g., croí/Irish) have subtitles 

3332 # as normal paragraphs in the same cell under a descrip- 

3333 # tive text that should be treated as a title (e.g., 

3334 # "Forms with the definite article", with "definite" not 

3335 # mentioned elsewhere). 

3336 new_titles = list(titles) 

3337 if celltext: 

3338 new_titles.append(celltext) 

3339 subtbl = handle_table1( 

3340 wxr, 

3341 tablecontext, 

3342 word, 

3343 lang, 

3344 pos, 

3345 data, 

3346 tbl, 

3347 new_titles, 

3348 source, 

3349 "", 

3350 depth + 1, 

3351 ) 

3352 if subtbl: 3352 ↛ 3330line 3352 didn't jump to line 3330 because the condition on line 3352 was always true

3353 sub_ret.append((rows, titles, after, depth)) 

3354 rows = [] 

3355 titles = [] 

3356 after = "" 

3357 sub_ret.extend(subtbl) 

3358 

3359 # This magic value is used as part of header detection 

3360 cellstyle = ( 

3361 col.attrs.get("style", "") 

3362 + "//" 

3363 + col.attrs.get("class", "") 

3364 + "//" 

3365 + str(kind) 

3366 ) 

3367 

3368 if not row: # if first column in row 

3369 style = cellstyle 

3370 target = None 

3371 titletext = celltext.strip() 

3372 while titletext and is_superscript(titletext[-1]): 

3373 titletext = titletext[:-1] 

3374 

3375 ( 

3376 is_title, 

3377 hdr_expansion, 

3378 target, 

3379 celltext, 

3380 ) = determine_header( 

3381 wxr, 

3382 tablecontext, 

3383 lang, 

3384 word, 

3385 pos, 

3386 tree.kind, 

3387 kind, 

3388 style, 

3389 row, 

3390 col, 

3391 celltext, 

3392 titletext, 

3393 cols_headered, 

3394 None, 

3395 cellstyle, 

3396 ) 

3397 

3398 if is_title: 

3399 # If this cell gets a "*" tag, make the whole column 

3400 # below it (toggling it in cols_headered = [F, F, T...]) 

3401 # into headers. 

3402 while len(cols_headered) <= len(row): 

3403 cols_headered.append(False) 

3404 if any("*" in tt for tt in hdr_expansion): 

3405 cols_headered[len(row)] = True 

3406 celltext = "" 

3407 # if row_has_nonempty_cells has been True at some point, it 

3408 # keeps on being True. 

3409 # if row_has_nonempty_cells or is_title or celltext != "": 

3410 # row_has_nonempty_cells = True 

3411 # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓ 

3412 row_has_nonempty_cells |= is_title or celltext != "" 

3413 cell = InflCell( 

3414 celltext, is_title, colspan, rowspan, target 

3415 ) 

3416 for _ in range(0, colspan): 

3417 # colspan🡘 current loop (col) or 1 

3418 # All the data-filling for colspan 

3419 # is done simply in this loop, 

3420 # while rowspan needs to use 

3421 # vertical_still_left to count gaps 

3422 # and col_gap_data to fill in 

3423 # those gaps with InflCell data. 

3424 if rowspan > 1: # rowspan🡙 current loop (col) or 1 

3425 while len(col_gap_data) <= len(row): 

3426 # Initialize col_gap_data/ed if 

3427 # it is lacking slots 

3428 # for each column; col_gap_data and 

3429 # vertical_still_left are never 

3430 # reset to [], during 

3431 # the whole table function. 

3432 col_gap_data.append(None) 

3433 vertical_still_left.append(0) 

3434 # Below is where the "rectangle" block of rowspan 

3435 # and colspan is filled for the future. 

3436 col_gap_data[len(row)] = cell 

3437 # col_gap_data contains cells that 

3438 # will be used in the 

3439 # future, or None 

3440 vertical_still_left[len(row)] = rowspan - 1 

3441 # A counter for how many gaps🡙 are still left to be 

3442 # filled (row.append or 

3443 # row[col_gap_data[len(row)] => 

3444 # rows), it is not reset to [], but decremented to 0 

3445 # each time a row gets something from col_gap_data. 

3446 # Append this cell 1+ times for colspan🡘 

3447 row.append(cell) 

3448 if not row: 

3449 continue 

3450 # After looping the original row-nodes above, fill 

3451 # in the rest of the row if the final cell has colspan 

3452 # (inherited from above, so a cell with rowspan and colspan) 

3453 for i in range(len(row), len(vertical_still_left)): 

3454 if vertical_still_left[i] <= 0: 

3455 continue 

3456 vertical_still_left[i] -= 1 

3457 while len(row) < i: 

3458 row.append(InflCell("", False, 1, 1, None)) 

3459 row.append(col_gap_data[i]) 

3460 # print(" ROW {!r}".format(row)) 

3461 if row_has_nonempty_cells: 3461 ↛ 3207line 3461 didn't jump to line 3207 because the condition on line 3461 was always true

3462 rows.append(row) 

3463 elif kind in ( 3463 ↛ 3207line 3463 didn't jump to line 3207 because the condition on line 3463 was always true

3464 NodeKind.TABLE_HEADER_CELL, 

3465 NodeKind.TABLE_CELL, 

3466 "th", 

3467 "td", 

3468 "span", 

3469 ): 

3470 # print(" TOP-LEVEL CELL", node) 

3471 pass 

3472 

3473 if sub_ret: 

3474 main_ret = sub_ret 

3475 main_ret.append((rows, titles, after, depth)) 

3476 else: 

3477 main_ret = [(rows, titles, after, depth)] 

3478 return main_ret 

3479 

3480 new_rows = handle_table1( 

3481 wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0 

3482 ) 

3483 

3484 # Now we have a table that has been parsed into rows and columns of 

3485 # InflCell objects. Parse the inflection table from that format. 

3486 if new_rows: 3486 ↛ exitline 3486 didn't return from function 'handle_wikitext_or_html_table' because the condition on line 3486 was always true

3487 for rows, titles, after, depth in new_rows: 

3488 handle_generic_table( 

3489 wxr, 

3490 tablecontext, 

3491 data, 

3492 word, 

3493 lang, 

3494 pos, 

3495 rows, 

3496 titles, 

3497 source, 

3498 after, 

3499 depth, 

3500 ) 

3501 

3502 

def get_table_cells(node: WikiNode) -> Generator[WikiNode, None, None]:
    """Yield the direct-child cell nodes of ``node``.

    Wikitext table cells sometimes contain raw HTML ``<th>``/``<td>``
    elements, because it is easier to write wikitext conditionals that
    way; those end up parsed as children of the wikitext cell.  Every
    WikiNode child of ``node`` is yielded as-is, except that when a cell
    has embedded th/td HTML children, those are detached from the cell
    and yielded immediately after it.
    """

    def _is_html_cell(child) -> bool:
        # An embedded raw-HTML header or data cell.
        return isinstance(child, HTMLNode) and child.sarg in ("th", "td")

    for cell in node.children:
        if not isinstance(cell, WikiNode):
            continue
        embedded = [c for c in cell.children if _is_html_cell(c)]
        if embedded:
            # Detach the th/td elements so they are not returned twice
            # when the caller later walks the cell's remaining children.
            cell.children = [
                c for c in cell.children if not _is_html_cell(c)
            ]
            yield cell
            yield from embedded
        else:
            yield cell

3531 

3532 

def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Thin shim that forwards an HTML table to the common handler.

    HTML and wikitext tables share all parsing logic in
    handle_wikitext_or_html_table(); this wrapper exists only to keep
    the two call sites symmetrical.  XXX, remove these?
    """
    common_args = (wxr, word, lang, pos, data, tree, titles, source, after)
    handle_wikitext_or_html_table(*common_args, tablecontext)

3540 

3541 

def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """A passer-on function for wikitext-tables, XXX, remove these?"""
    # NOTE(review): the docstring previously said "html-tables", a
    # copy-paste from handle_html_table; this wrapper handles wikitext
    # tables.  Both delegate to the same shared implementation.
    handle_wikitext_or_html_table(
        wxr, word, lang, pos, data, tree, titles, source, after, tablecontext
    )

3549 

3550 

def parse_inflection_section(
    wxr, data, word, lang, pos, section, tree, tablecontext=None
):
    """Parses an inflection section on a page.  ``data`` should be the
    data for a part-of-speech, and inflections will be added to it.

    ``tree`` is the parsed ROOT node for the section contents.
    ``section`` (the heading text) is used both as the ``source`` label
    passed down to the table handlers and for the --expand-tables debug
    dump at the end.  ``tablecontext``, when given, must be a
    TableContext instance.
    """

    # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}"
    #       .format(word, lang, pos, section))
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(section, str)
    assert isinstance(tree, WikiNode)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    source = section
    # Tables collected while walking the tree; each entry is a mutable
    # list [kind, table-node, titles, after-strings], where ``kind`` is
    # "wikitext" or "html" and after-strings accumulates stray text
    # encountered after the table node.
    tables = []
    # Title fragments gathered from NavHead text while inside a NavFrame.
    titleparts = []
    # Text of a preceding ";"-definition list item, used as a fallback
    # title for the tables that follow it.
    preceding_bolded_title = ""

    # from wikitextprocessor.parser import print_tree
    # print_tree(tree)
    # print("--------------******************----------------")

    def process_tables():
        # Dispatch every table collected so far to the matching handler.
        for kind, node, titles, after in tables:
            after = "".join(after).strip()
            after = clean_value(wxr, after)
            if kind == "wikitext":
                handle_wikitext_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            elif kind == "html":
                handle_html_table(
                    wxr,
                    word,
                    lang,
                    pos,
                    data,
                    node,
                    titles,
                    source,
                    after,
                    tablecontext=tablecontext,
                )
            else:
                raise RuntimeError(
                    "{}: unimplemented table kind {}".format(word, kind)
                )

    def recurse_navframe(node, titles):
        # Process a NavFrame div in isolation: tables found inside it are
        # collected into a fresh ``tables`` list, handled immediately,
        # and then the outer ``tables`` list is restored.
        nonlocal tables
        nonlocal titleparts
        titleparts = []
        old_tables = tables
        tables = []

        recurse(node, [], navframe=True)

        process_tables()
        tables = old_tables

    def recurse(node, titles, navframe=False):
        # Walk the node tree collecting tables (into ``tables``) and,
        # when ``navframe`` is True, title text (into ``titleparts``).
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Stray strings after a table are recorded as "after" text;
            # inside a NavFrame they contribute to the title instead.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    # The NavHead text collected so far becomes a title
                    # for tables inside this NavContent, unless it is
                    # just a notes block.
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                # NOTE(review): ``classes`` here is the raw attribute
                # string (default ()), so this is a substring test,
                # unlike the .split() membership test above.
                classes = node.attrs.get("class", ())
                if "audiotable" in classes:
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
        if (
            kind == NodeKind.HTML
            and node.sarg == "div"
            and "NavFrame" in node.attrs.get("class", "").split()
        ):
            recurse_navframe(node, titles)
            return
        if kind == NodeKind.LINK:
            # Descend into the link's display text when present,
            # otherwise into the target.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.HTML and node.sarg == "ref":
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A ";"-list renders as a bolded line; remember its text as
            # a title for subsequent top-level children of the section.
            nonlocal preceding_bolded_title
            from wiktextract.page import clean_node

            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

    # XXX this code is used for extracting tables for inflection tests
    if wxr.config.expand_tables:
        if section != "Mutation":
            with open(wxr.config.expand_tables, "w") as f:
                f.write(word + "\n")
                f.write(lang + "\n")
                f.write(pos + "\n")
                f.write(section + "\n")
                text = wxr.wtp.node_to_wikitext(tree)
                f.write(text + "\n")