Coverage for src/wiktextract/extractor/en/inflection.py: 86%

1481 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1# Code for parsing inflection tables. 

2# 

3# Copyright (c) 2021-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org. 

4 

5import collections 

6import copy 

7import functools 

8import html 

9import itertools 

10import re 

11import unicodedata 

12from typing import Optional, Union 

13 

14from wikitextprocessor import MAGIC_FIRST, NodeKind, WikiNode 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, freeze, split_at_comma_semi 

18from ...tags import valid_tags 

19from ...wxr_context import WiktextractContext 

20from .form_descriptions import ( 

21 classify_desc, 

22 decode_tags, 

23 distw, 

24 parse_head_final_tags, 

25) 

26from .inflectiondata import infl_map, infl_start_map, infl_start_re 

27from .lang_specific_configs import get_lang_conf, lang_specific_tags 

28from .table_headers_heuristics_data import LANGUAGES_WITH_CELLS_AS_HEADERS 

29from .type_utils import FormData 

30 

# --debug-text-cell WORD
# Command-line flag for debugging: while parsing inflection tables, debug
# messages are printed whenever a cell containing this text is encountered.
debug_cell_text: Optional[str] = None


def set_debug_cell_text(text: str) -> None:
    """Set the module-level cell text that triggers debug output."""
    global debug_cell_text
    debug_cell_text = text


# A tagset is a list of alternative tag tuples (interpretations).
TagSets = list[tuple[str, ...]]

43 

44# Column texts that are interpreted as an empty column. 

45IGNORED_COLVALUES = { 

46 "-", 

47 "־", 

48 "᠆", 

49 "‐", 

50 "‑", 

51 "‒", 

52 "–", 

53 "—", 

54 "―", 

55 "−", 

56 "⸺", 

57 "⸻", 

58 "﹘", 

59 "﹣", 

60 "-", 

61 "/", 

62 "?", 

63 "not used", 

64 "not applicable", 

65} 

66 

67# These tags are never inherited from above 

68# XXX merge with lang_specific 

69noinherit_tags = { 

70 "infinitive-i", 

71 "infinitive-i-long", 

72 "infinitive-ii", 

73 "infinitive-iii", 

74 "infinitive-iv", 

75 "infinitive-v", 

76} 

77 

78# Subject->object transformation mapping, when using dummy-object-concord 

79# to replace subject concord tags with object concord tags 

80object_concord_replacements = { 

81 "first-person": "object-first-person", 

82 "second-person": "object-second-person", 

83 "third-person": "object-third-person", 

84 "singular": "object-singular", 

85 "plural": "object-plural", 

86 "definite": "object-definite", 

87 "indefinite": "object-indefinite", 

88 "class-1": "object-class-1", 

89 "class-2": "object-class-2", 

90 "class-3": "object-class-3", 

91 "class-4": "object-class-4", 

92 "class-5": "object-class-5", 

93 "class-6": "object-class-6", 

94 "class-7": "object-class-7", 

95 "class-8": "object-class-8", 

96 "class-9": "object-class-9", 

97 "class-10": "object-class-10", 

98 "class-11": "object-class-11", 

99 "class-12": "object-class-12", 

100 "class-13": "object-class-13", 

101 "class-14": "object-class-14", 

102 "class-15": "object-class-15", 

103 "class-16": "object-class-16", 

104 "class-17": "object-class-17", 

105 "class-18": "object-class-18", 

106 "masculine": "object-masculine", 

107 "feminine": "object-feminine", 

108} 

109 

# Words in title that cause addition of tags in all entries
title_contains_global_map = {
    "possessive": "possessive",
    "possessed forms of": "possessive",
    "predicative forms of": "predicative",
    "negative": "negative",
    "positive definite forms": "positive definite",
    "positive indefinite forms": "positive indefinite",
    "comparative": "comparative",
    "superlative": "superlative",
    "combined forms": "combined-form",
    "mutation": "mutation",
    "definite article": "definite",
    "indefinite article": "indefinite",
    "indefinite declension": "indefinite",
    "bare forms": "indefinite",  # e.g., cois/Irish
    "definite declension": "definite",
    "pre-reform": "dated",
    "personal pronouns": "personal pronoun",
    "composed forms of": "multiword-construction",
    "subordinate-clause forms of": "subordinate-clause",
    "participles of": "participle",
    "variation of": "dummy-skip-this",  # a'/Scottish Gaelic
    "command form of": "imperative",  # a راتلل/Pashto
    "historical inflection of": "dummy-skip-this",  # kork/Norwegian Nynorsk
    "obsolete declension": "obsolete",  # März/German 20241111
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_contains_global_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_CONTAINS_GLOBAL_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
# Matches "Inflection/Conjugation/Declension/Mutation of <word>" prefixes,
# whose words must not be interpreted as tag-bearing title words.
table_hdr_ign_part = r"(Inflection|Conjugation|Declension|Mutation) of [^\s]"

table_hdr_ign_part_re = re.compile(r"(?i)(" + table_hdr_ign_part + ")")
# (?i) python regex extension, ignore case
title_contains_global_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_global_map.keys()),
    )
)

150 

# Words in title that cause addition of tags to table-tags "form"
title_contains_wordtags_map = {
    "pf": "perfective",
    "impf": "imperfective",
    "strong": "strong",
    "weak": "weak",
    "countable": "countable",
    "uncountable": "uncountable",
    "inanimate": "inanimate",
    "animate": "animate",
    "transitive": "transitive",
    "intransitive": "intransitive",
    "ditransitive": "ditransitive",
    "ambitransitive": "ambitransitive",
    "archaic": "archaic",
    "dated": "dated",
    "affirmative": "affirmative",
    "negative": "negative",
    "subject pronouns": "subjective",
    "object pronouns": "objective",
    "emphatic": "emphatic",
    "proper noun": "proper-noun",
    "no plural": "no-plural",
    "imperfective": "imperfective",
    "perfective": "perfective",
    "no supine stem": "no-supine",
    "no perfect stem": "no-perfect",
    "deponent": "deponent",
    "irregular": "irregular",
    "no short forms": "no-short-form",
    "iō-variant": "iō-variant",
    "1st declension": "declension-1",
    "2nd declension": "declension-2",
    "3rd declension": "declension-3",
    "4th declension": "declension-4",
    "5th declension": "declension-5",
    "6th declension": "declension-6",
    "first declension": "declension-1",
    "second declension": "declension-2",
    "third declension": "declension-3",
    "fourth declension": "declension-4",
    "fifth declension": "declension-5",
    "sixth declension": "declension-6",
    "1st conjugation": "conjugation-1",
    "2nd conjugation": "conjugation-2",
    "3rd conjugation": "conjugation-3",
    "4th conjugation": "conjugation-4",
    "5th conjugation": "conjugation-5",
    "6th conjugation": "conjugation-6",
    "7th conjugation": "conjugation-7",
    "first conjugation": "conjugation-1",
    "second conjugation": "conjugation-2",
    "third conjugation": "conjugation-3",
    "fourth conjugation": "conjugation-4",
    "fifth conjugation": "conjugation-5",
    "sixth conjugation": "conjugation-6",
    "seventh conjugation": "conjugation-7",
    # Corsican regional tags in table header
    "cismontane": "Cismontane",
    "ultramontane": "Ultramontane",
    "western lombard": "Western-Lombard",
    "eastern lombard": "Eastern-Lombard",
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_contains_wordtags_map.items():
    if any(t not in valid_tags for t in v.split()):
        print(
            "TITLE_CONTAINS_WORDTAGS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v)
        )
title_contains_wordtags_re = re.compile(
    r"(?i)(^|\b)({}|{})($|\b)".format(
        table_hdr_ign_part,
        "|".join(re.escape(x) for x in title_contains_wordtags_map.keys()),
    )
)

225 

# Parenthesized elements in title that are converted to tags in
# "table-tags" form
title_elements_map = {
    "weak": "weak",
    "strong": "strong",
    "separable": "separable",
    "masculine": "masculine",
    "feminine": "feminine",
    "neuter": "neuter",
    "singular": "singular",
    "plural": "plural",
    "archaic": "archaic",
    "dated": "dated",
    "Attic": "Attic",  # e.g. καλός/Greek/Adj
    "Epic": "Epic",  # e.g. καλός/Greek/Adj
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_elements_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMENTS_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))

245 

# Parenthesized element prefixes that map to tags; the remainder of the
# element becomes a "form" carrying those tags.
title_elemstart_map = {
    "auxiliary": "auxiliary",
    "Kotus type": "class",
    "ÕS type": "class",
    "class": "class",
    "short class": "class",
    "type": "class",
    "strong class": "class",
    "weak class": "class",
    "accent paradigm": "accent-paradigm",
    "stem in": "class",
}
# Import-time sanity check: every mapped value must consist of known tags.
for k, v in title_elemstart_map.items():
    if any(t not in valid_tags for t in v.split()):
        print("TITLE_ELEMSTART_MAP UNRECOGNIZED TAG: {}: {}".format(k, v))
title_elemstart_re = re.compile(
    r"^({}) ".format("|".join(re.escape(x) for x in title_elemstart_map.keys()))
)

266 

267 

# Regexp for cell starts that are likely definitions of reference symbols.
# See also nondef_re.
def_re = re.compile(
    r"(\s*•?\s+)?"
    r"((\*+|[△†0123456789⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻]+)([⁾):]|\s|(?=[A-Z]))|"
    r"\^(\*+|[△†])|"
    r"([¹²³⁴⁵⁶⁷⁸⁹])|"
    r"([ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻᵝᵞᵟᶿᶥᵠᵡ]))"
)
# ᴺᴸᴴ persan/Old Irish

# Regexp for cell starts that are exceptions to def_re and do not actually
# start a definition.
nondef_re = re.compile(
    r"(^\s*(1|2|3)\s+(sg|pl)\s*$|"  # 1s or 3p etc.
    r"\s*\d\d?\s*/\s*\d\d?\s*$)"
)  # taka/Swahili "15 / 17"

# Certain tags are moved from headers in tables into word tags, as they always
# apply to the whole word.
TAGS_FORCED_WORDTAGS: set[str] = set()

294 

295 

class InflCell:
    """Cell in an inflection table.

    Attributes:
        text: cleaned (stripped) cell text
        is_title: True if this cell is a non-empty header/title cell
        colspan: number of columns the cell spans (>= 1)
        rowspan: number of rows the cell spans (>= 1)
        target: optional link target associated with the cell
    """

    __slots__ = (
        "text",
        "is_title",
        "colspan",
        "rowspan",
        "target",
    )

    def __init__(
        self,
        text: str,
        is_title: bool,
        colspan: int,
        rowspan: int,
        target: Optional[str],
    ) -> None:
        assert isinstance(text, str)
        assert is_title in (True, False)
        assert isinstance(colspan, int) and colspan >= 1
        assert isinstance(rowspan, int) and rowspan >= 1
        assert target is None or isinstance(target, str)
        self.text = text.strip()
        # BUG FIX: previously ``text and is_title`` could store "" (a str)
        # instead of False, and a whitespace-only cell counted as a title
        # even though self.text became empty.  Base the decision on the
        # stripped text and always store a real bool.
        self.is_title = bool(self.text) and is_title
        self.colspan = colspan
        self.rowspan = rowspan
        self.target = target

    def __str__(self) -> str:
        v = "{}/{}/{}/{!r}".format(
            self.text, self.is_title, self.colspan, self.rowspan
        )
        if self.target:
            v += ": {!r}".format(self.target)
        return v

    def __repr__(self) -> str:
        return str(self)

336 

337 

class HdrSpan:
    """Saved information about a header cell/span during the parsing
    of a table."""

    __slots__ = (
        "start",
        "colspan",
        "rowspan",
        "rownum",  # Row number where this occurred
        "tagsets",  # list of tuples
        "text",  # For debugging
        "all_headers_row",
        "expanded",  # The header has been expanded to cover whole row/part
    )

    def __init__(
        self,
        start: int,
        colspan: int,
        rowspan: int,
        rownum: int,
        tagsets: "TagSets",
        text: str,
        all_headers_row: bool,
    ) -> None:
        assert isinstance(start, int) and start >= 0
        assert isinstance(colspan, int) and colspan >= 1
        # BUG FIX: rowspan was accepted without validation while colspan was
        # checked; add the matching check for consistency.
        assert isinstance(rowspan, int) and rowspan >= 1
        assert isinstance(rownum, int)
        assert isinstance(tagsets, list)
        for x in tagsets:
            assert isinstance(x, tuple)
        assert all_headers_row in (True, False)
        self.start = start
        self.colspan = colspan
        self.rowspan = rowspan
        self.rownum = rownum
        # Normalize each alternative: deduplicate and sort its tags.
        self.tagsets = list(tuple(sorted(set(tags))) for tags in tagsets)
        self.text = text
        self.all_headers_row = all_headers_row
        self.expanded = False

378 

379 

def is_superscript(ch: str) -> bool:
    """Returns True if the argument is a superscript character."""
    assert isinstance(ch, str) and len(ch) == 1
    try:
        uname = unicodedata.name(ch)
    except ValueError:
        # Unnamed (e.g. unassigned) code points are never superscripts.
        return False
    # Superscripts and modifier letters share these Unicode name prefixes.
    return uname.startswith(
        (
            "SUPERSCRIPT ",
            "MODIFIER LETTER SMALL ",
            "MODIFIER LETTER CAPITAL ",
        )
    )

396 

397 

def remove_useless_tags(lang: str, pos: str, tags: set[str]) -> None:
    """Remove certain tag combinations from ``tags`` when they serve no
    purpose together (cover all options).  Mutates ``tags`` in place."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(tags, set)
    if (
        "animate" in tags
        and "inanimate" in tags
        and get_lang_conf(lang, "animate_inanimate_remove")
    ):
        tags.remove("animate")
        tags.remove("inanimate")
    if (
        "virile" in tags
        and "nonvirile" in tags
        and get_lang_conf(lang, "virile_nonvirile_remove")
    ):
        tags.remove("virile")
        tags.remove("nonvirile")
    # If every value of a whole category in the language is present
    # (numbers, genders, etc.), the tags convey no information and are all
    # removed.  This replaces six copy-pasted blocks with one loop.
    for conf_key in (
        "numbers",
        "genders",
        "voices",
        "strengths",
        "persons",
        "definitenesses",
    ):
        values = get_lang_conf(lang, conf_key)
        if values and all(x in tags for x in values):
            for x in values:
                tags.remove(x)

448 

449 

def tagset_cats(tagset: TagSets) -> set[str]:
    """Returns a set of tag categories for the tagset (merged from all
    alternatives)."""
    return {valid_tags[tag] for alternative in tagset for tag in alternative}

454 

455 

def or_tagsets(
    lang: str, pos: str, tagsets1: TagSets, tagsets2: TagSets
) -> TagSets:
    """Merges two tagsets (the new tagset just merges the tags from both, in
    all combinations). If they contain simple alternatives (differ in
    only one category), they are simply merged; otherwise they are split to
    more alternatives. The tagsets are assumed to be sets of sorted tuples."""
    assert isinstance(tagsets1, list)
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list)
    # BUG FIX: this assertion previously re-checked tagsets1 (copy-paste
    # error); it now validates tagsets2 as intended.
    assert all(isinstance(x, tuple) for x in tagsets2)
    tagsets: TagSets = []  # This will be the result

    def add_tags(tags1: tuple[str, ...]) -> None:
        # Merge one alternative into ``tagsets``, combining it with an
        # existing alternative when they differ in at most one category.
        if not tags1:
            return  # empty set would merge with anything, won't change result
        if not tagsets:
            tagsets.append(tags1)
            return
        for tags2 in tagsets:
            # Determine if tags1 can be merged with tags2
            num_differ = 0
            if tags1 and tags2:
                cats1 = set(valid_tags[t] for t in tags1)
                cats2 = set(valid_tags[t] for t in tags2)
                cats = cats1 | cats2
                for cat in cats:
                    tags1_in_cat = set(t for t in tags1 if valid_tags[t] == cat)
                    tags2_in_cat = set(t for t in tags2 if valid_tags[t] == cat)
                    if (
                        tags1_in_cat != tags2_in_cat
                        or not tags1_in_cat
                        or not tags2_in_cat
                    ):
                        num_differ += 1
                        if not tags1_in_cat or not tags2_in_cat:
                            # Prevent merging if one is empty
                            num_differ += 1
            if num_differ <= 1:
                # Yes, they can be merged
                tagsets.remove(tags2)
                tags_s = set(tags1) | set(tags2)
                remove_useless_tags(lang, pos, tags_s)
                tags_t = tuple(sorted(tags_s))
                add_tags(tags_t)  # Could result in further merging
                return
        # If we could not merge, add to tagsets
        tagsets.append(tags1)

    for tags in tagsets1:
        add_tags(tags)
    for tags in tagsets2:
        add_tags(tags)
    if not tagsets:
        tagsets.append(())
    return tagsets

518 

519 

def and_tagsets(
    lang: str,
    pos: str,
    tagsets1: list[tuple[str, ...]],
    tagsets2: list[tuple[str, ...]],
) -> list[tuple[str, ...]]:
    """Merges tagsets by taking union of all combinations, without trying
    to determine whether they are compatible."""
    assert isinstance(tagsets1, list) and len(tagsets1) >= 1
    assert all(isinstance(x, tuple) for x in tagsets1)
    assert isinstance(tagsets2, list) and len(tagsets2) >= 1
    # BUG FIX: this assertion previously re-checked tagsets1 (copy-paste
    # error); it now validates tagsets2 as intended.
    assert all(isinstance(x, tuple) for x in tagsets2)
    new_tagsets = []
    tags: Union[set[str], tuple[str, ...]]
    for tags1 in tagsets1:
        for tags2 in tagsets2:
            tags = set(tags1) | set(tags2)
            remove_useless_tags(lang, pos, tags)
            # The ignored-cell marker must not leak into merged tagsets.
            if "dummy-ignored-text-cell" in tags:
                tags.remove("dummy-ignored-text-cell")
            tags = tuple(sorted(tags))
            if tags not in new_tagsets:
                new_tagsets.append(tags)
    return new_tagsets

546 

547 

@functools.lru_cache(65536)
def extract_cell_content(
    lang: str, word: str, col: str
) -> tuple[str, list[str], list[tuple[str, str]], list[str]]:
    """Cleans a row/column header for later processing. This returns
    (cleaned, refs, defs, tags): ``cleaned`` is the cell text with
    reference markers removed, ``refs`` lists reference symbols attached
    to the cell, ``defs`` lists (symbol, definition) pairs when the cell
    itself defines reference symbols, and ``tags`` lists tags implied by
    special reference markers."""
    hdr_tags = []
    # Strip a trailing comma or bullet and collapse runs of whitespace.
    col = re.sub(r"(?s)\s*,\s*$", "", col)
    col = re.sub(r"(?s)\s*•\s*$", "", col)
    col = re.sub(r"\s+", " ", col)
    col = col.strip()
    # Cells that begin like free-form explanatory notes are ignored wholesale.
    if re.search(
        r"^\s*(There are |"
        r"\* |"
        r"see |"
        r"Use |"
        r"use the |"
        r"Only used |"
        r"The forms in |"
        r"these are also written |"
        r"The genitive can be |"
        r"Genitive forms are rare or non-existant|"
        r"Accusative Note: |"
        r"Classifier Note: |"
        r"Noun: Assamese nouns are |"
        r"the active conjugation|"
        r"the instrumenal singular|"
        r"Note:|"
        r"\^* Note:|"
        r"possible mutated form |"
        r"The future tense: )",
        col,
    ):
        return "dummy-ignored-text-cell", [], [], []

    # Temporarily remove final parenthesized part (if separated by whitespace),
    # so that we can extract reference markers before it.
    final_paren = ""
    m = re.search(r"\s+\([^)]*\)$", col)
    if m is not None:
        final_paren = m.group(0)
        col = col[: m.start()]

    # Extract references and tag markers (e.g. "^1", "^(a,b)") from the end.
    refs = []
    special_references = get_lang_conf(lang, "special_references")
    while True:
        m = re.search(r"\^(.|\([^)]*\))$", col)
        if not m:
            break
        r = m.group(1)
        if r.startswith("(") and r.endswith(")"):
            r = r[1:-1]
        for r1 in r.split(","):
            if r1 == "rare":
                hdr_tags.append("rare")
            elif special_references and r1 in special_references:
                # Language-specific marker that maps directly to tags.
                hdr_tags.extend(special_references[r1].split())
            else:
                if r1.startswith("(") and r1.endswith(")"):
                    r1 = r1[1:-1]
                # NFKD-normalize so superscript symbols compare equal to
                # their plain counterparts.
                refs.append(unicodedata.normalize("NFKD", r1))
        col = col[: m.start()]
    # See if the remaining text is itself a definition of reference symbols.
    m = def_re.match(col)
    if m and not nondef_re.match(col):
        ofs = 0
        ref = None
        deflst = []
        # Each def_re match starts a new definition; the text up to the
        # next match (or the end) is the previous symbol's definition.
        for m in re.finditer(def_re, col):
            if ref:
                deflst.append((ref, col[ofs : m.start()].strip()))
            ref = unicodedata.normalize(
                "NFKD", m.group(3) or m.group(5) or m.group(6) or ""
            )
            ofs = m.end()
        if ref:
            deflst.append((ref, col[ofs:].strip()))
        return "", [], deflst, []
    # See if it *looks* like a reference to a definition: peel trailing
    # superscript characters and daggers off one at a time.
    while col:
        if is_superscript(col[-1]) or col[-1] in ("†",):
            if col.endswith("ʳᵃʳᵉ"):
                # Superscript "rare" marker.
                hdr_tags.append("rare")
                col = col[:-4].strip()
                continue
            if special_references:
                stop_flag = False
                for r in special_references:
                    if col.endswith(r):
                        hdr_tags.extend(special_references[r].split())
                        col = col[: -len(r)].strip()
                        stop_flag = True
                        break  # this for loop
                if stop_flag:
                    continue  # this while loop
            # Numbers and H/L/N are useful information
            refs.append(unicodedata.normalize("NFKD", col[-1]))
            col = col[:-1]
        else:
            break

    # Check for another form of note definition, e.g. "1) ...", "2: ...".
    if (
        len(col) > 2
        and col[1] in (")", " ", ":")
        and col[0].isdigit()
        and not re.match(nondef_re, col)
    ):
        return "", [], [(col[0], col[2:].strip())], []
    col = col.strip()

    # Extract final "*" reference symbols. Sometimes there are multiple.
    m = re.search(r"\*+$", col)
    if m is not None:
        col = col[: m.start()]
        refs.append(unicodedata.normalize("NFKD", m.group(0)))
    if col.endswith("(*)"):
        col = col[:-3].strip()
        refs.append("*")

    # Put back the final parenthesized part
    col = col.strip() + final_paren
    return col.strip(), refs, [], hdr_tags

680 

681 

@functools.lru_cache(10000)
def parse_title(
    title: str, source: str
) -> tuple[list[str], list[str], list[FormData]]:
    """Parses inflection table title. This returns (global_tags, table_tags,
    extra_forms), where ``global_tags`` is tags to be added to each inflection
    entry, ``table_tags`` are tags for the word but not to be added to every
    form, and ``extra_forms`` is a list of dictionaries describing additional
    forms to be included in the part-of-speech entry."""
    assert isinstance(title, str)
    assert isinstance(source, str)
    # Unescape HTML entities, drop HTML tags, and collapse whitespace.
    title = html.unescape(title)
    title = re.sub(r"(?i)<[^>]*>", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    global_tags = []
    table_tags = []
    extra_forms = []
    # Add certain global tags based on contained words
    for m in re.finditer(title_contains_global_re, title):
        v = m.group(0).lower()
        # Skip matches that are part of an "Inflection/Conjugation of ..."
        # style prefix rather than genuine title words.
        if re.match(table_hdr_ign_part_re, v):
            continue
        global_tags.extend(title_contains_global_map[v].split())
    # Add certain tags to table-tags "form" based on contained words
    for m in re.finditer(title_contains_wordtags_re, title):
        v = m.group(0).lower()
        if re.match(table_hdr_ign_part_re, v):
            continue
        table_tags.extend(title_contains_wordtags_map[v].split())
    # French reflexive verb tables ("s'..."/"se ...") are tagged globally.
    if re.search(r"Conjugation of (s’|se ).*French verbs", title):
        global_tags.append("reflexive")
    # Check for <x>-type at the beginning of title (e.g., Armenian) and various
    # other ways of specifying an inflection class.
    for m in re.finditer(
        r"\b("
        r"[\w/]+-type|"
        r"accent-\w+|"
        r"[\w/]+-stem|"
        r"[^ ]+ gradation|"
        r"\b(stem in [\w/ ]+)|"
        r"[^ ]+ alternation|"
        r"(First|Second|Third|Fourth|Fifth|Sixth|Seventh) "
        r"(Conjugation|declension)|"
        r"First and second declension|"
        r"(1st|2nd|3rd|4th|5th|6th) declension|"
        r"\w[\w/ ]* harmony"
        r")\b",
        title,
    ):
        dt: FormData = {"form": m.group(1), "source": source, "tags": ["class"]}
        extra_forms.append(dt)
    # Parse parenthesized part from title
    for m in re.finditer(r"\(([^)]*)\)", title):
        for elem in m.group(1).split(","):
            # group(0) is the whole string, group(1) first parens
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            else:
                m1 = re.match(title_elemstart_re, elem)
                if m1:
                    # Known prefix ("class", "type", ...) followed by the
                    # actual class name; record it as an extra form.
                    tags = title_elemstart_map[m1.group(1)].split()
                    dt = {
                        "form": elem[m1.end() :],
                        "source": source,
                        "tags": tags,
                    }
                    extra_forms.append(dt)
    # For titles that contain no parenthesized parts, do some special
    # handling to still interpret parts from them
    if "(" not in title:
        # No parenthesized parts
        m1 = re.search(r"\b(Portuguese) (-.* verb) ", title)
        if m1 is not None:
            dt = {"form": m1.group(2), "tags": ["class"], "source": source}
            extra_forms.append(dt)
        for elem in title.split(","):
            elem = elem.strip()
            if elem in title_elements_map:
                table_tags.extend(title_elements_map[elem].split())
            elif elem.endswith("-stem"):
                dt = {"form": elem, "tags": ["class"], "source": source}
                extra_forms.append(dt)
    return global_tags, table_tags, extra_forms

767 

768 

769def expand_header( 

770 wxr: WiktextractContext, 

771 tablecontext: "TableContext", 

772 word: str, 

773 lang: str, 

774 pos: str, 

775 text: str, 

776 base_tags: Union[list[str], set[str], tuple[str, ...]], 

777 silent=False, 

778 ignore_tags=False, 

779 depth=0, 

780) -> list[tuple[str, ...]]: 

781 """Expands a cell header to tagset, handling conditional expressions 

782 in infl_map. This returns list of tuples of tags, each list element 

783 describing an alternative interpretation. ``base_tags`` is combined 

784 column and row tags for the cell in which the text is being interpreted 

785 (conditional expressions in inflection data may depend on it). 

786 If ``silent`` is True, then no warnings will be printed. If ``ignore_tags`` 

787 is True, then tags listed in "if" will be ignored in the test (this is 

788 used when trying to heuristically detect whether a non-<th> cell is anyway 

789 a header).""" 

790 assert isinstance(wxr, WiktextractContext) 

791 assert isinstance(word, str) 

792 assert isinstance(lang, str) 

793 assert isinstance(pos, str) 

794 assert isinstance(text, str) 

795 assert isinstance(base_tags, (list, tuple, set)) 

796 assert silent in (True, False) 

797 assert isinstance(depth, int) 

798 # print("EXPAND_HDR: text={!r} base_tags={!r}".format(text, base_tags)) 

799 # First map the text using the inflection map 

800 text = clean_value(wxr, text) 

801 combined_return: list[tuple[str, ...]] = [] 

802 parts = split_at_comma_semi(text, separators=[";"]) 

803 for text in parts: 

804 if not text: 804 ↛ 805line 804 didn't jump to line 805 because the condition on line 804 was never true

805 continue 

806 if text in infl_map: 

807 v = infl_map[text] # list or string 

808 else: 

809 m = re.match(infl_start_re, text) 

810 if m is not None: 810 ↛ 811line 810 didn't jump to line 811 because the condition on line 810 was never true

811 v = infl_start_map[m.group(1)] 

812 # print("INFL_START {} -> {}".format(text, v)) 

813 elif re.match(r"Notes", text): 

814 # Ignored header 

815 # print("IGNORING NOTES") 

816 combined_return = or_tagsets( 

817 lang, pos, combined_return, [("dummy-skip-this",)] 

818 ) 

819 # this just adds dummy-skip-this 

820 continue 

821 elif text in IGNORED_COLVALUES: 

822 combined_return = or_tagsets( 

823 lang, pos, combined_return, [("dummy-ignore-skipped",)] 

824 ) 

825 continue 

826 # Try without final parenthesized part 

827 text_without_parens = re.sub(r"[,/]?\s+\([^)]*\)\s*$", "", text) 

828 if text_without_parens in infl_map: 

829 v = infl_map[text_without_parens] 

830 elif m is None: 830 ↛ 846line 830 didn't jump to line 846 because the condition on line 830 was always true

831 if not silent: 

832 wxr.wtp.debug( 

833 "inflection table: unrecognized header: {}".format( 

834 repr(text) 

835 ), 

836 sortid="inflection/735", 

837 ) 

838 # Unrecognized header 

839 combined_return = or_tagsets( 

840 lang, pos, combined_return, [("error-unrecognized-form",)] 

841 ) 

842 continue 

843 

844 # Then loop interpreting the value, until the value is a simple string. 

845 # This may evaluate nested conditional expressions. 

846 default_then = None 

847 while True: 

848 # If it is a string, we are done. 

849 if isinstance(v, str): 

850 tags = set(v.split()) 

851 remove_useless_tags(lang, pos, tags) 

852 tagset = [tuple(sorted(tags))] 

853 break 

854 # For a list, just interpret it as alternatives. (Currently the 

855 # alternatives must directly be strings.) 

856 if isinstance(v, (list, tuple)): 

857 tagset = [] 

858 for x in v: 

859 tags = set(x.split()) 

860 remove_useless_tags(lang, pos, tags) 

861 tags_t = tuple(sorted(tags)) 

862 if tags_t not in tagset: 862 ↛ 858line 862 didn't jump to line 858 because the condition on line 862 was always true

863 tagset.append(tags_t) 

864 break 

865 # Otherwise the value should be a dictionary describing a 

866 # conditional expression. 

867 if not isinstance(v, dict): 867 ↛ 868line 867 didn't jump to line 868 because the condition on line 867 was never true

868 wxr.wtp.debug( 

869 "inflection table: internal: " 

870 "UNIMPLEMENTED INFL_MAP VALUE: {}".format(infl_map[text]), 

871 sortid="inflection/767", 

872 ) 

873 tagset = [()] 

874 break 

875 # Evaluate the conditional expression. 

876 assert isinstance(v, dict) 

877 cond: Union[bool, str] = "default-true" 

878 c: Union[str, list[str], set[str]] = "" 

879 # Handle "lang" condition. The value must be either a 

880 # single language or a list of languages, and the 

881 # condition evaluates to True if the table is one of 

882 # those languages. 

883 if "lang" in v: 

884 c = v["lang"] 

885 if isinstance(c, str): 

886 cond = c == lang 

887 else: 

888 assert isinstance(c, (list, tuple, set)) 

889 cond = lang in c 

890 # Handle "nested-table-depth" condition. The value must 

891 # be an int or list of ints, and the condition evaluates 

892 # True if the depth is one of those values. 

893 # "depth" is how deep into a nested table tree the current 

894 # table lies. It is first started in handle_wikitext_table, 

895 # so only applies to tables-within-tables, not other 

896 # WikiNode content. `depth` is currently only passed as a 

897 # parameter down the table parsing stack, and not stored. 

898 if cond and "nested-table-depth" in v: 898 ↛ 899line 898 didn't jump to line 899 because the condition on line 898 was never true

899 d = v["nested-table-depth"] 

900 if isinstance(d, int): 

901 cond = d == depth 

902 else: 

903 assert isinstance(d, (list, tuple, set)) 

904 cond = depth in d 

905 # Handle inflection-template condition. Must be a string 

906 # or list of strings, and if tablecontext.template_name is in 

907 # those, accept the condition. 

908 # TableContext.template_name is passed down from page/ 

909 # parse_inflection, before parsing and expanding itself 

910 # has begun. 

911 if cond and tablecontext and "inflection-template" in v: 

912 d1 = v["inflection-template"] 

913 if isinstance(d1, str): 913 ↛ 916line 913 didn't jump to line 916 because the condition on line 913 was always true

914 cond = d1 == tablecontext.template_name 

915 else: 

916 assert isinstance(d1, (list, tuple, set)) 

917 cond = tablecontext.template_name in d1 

918 # Handle "pos" condition. The value must be either a single 

919 # part-of-speech or a list of them, and the condition evaluates to 

920 # True if the part-of-speech is any of those listed. 

921 if cond and "pos" in v: 

922 c = v["pos"] 

923 if isinstance(c, str): 

924 cond = c == pos 

925 else: 

926 assert isinstance(c, (list, tuple, set)) 

927 cond = pos in c 

928 # Handle "if" condition. The value must be a string containing a 

929 # space-separated list of tags. The condition evaluates to True if 

930 # ``base_tags`` contains all of the listed tags. If the condition 

931 # is of the form "any: ...tags...", then any of the tags will be 

932 # enough. 

933 if cond and "if" in v and not ignore_tags: 

934 c = v["if"] 

935 assert isinstance(c, str) 

936 # "if" condition is true if any of the listed tags is present if 

937 # it starts with "any:", otherwise all must be present 

938 if c.startswith("any: "): 

939 cond = any(t in base_tags for t in c[5:].split()) 

940 else: 

941 cond = all(t in base_tags for t in c.split()) 

942 

943 # Handle "default" assignment. Store the value to be used 

944 # as a default later. 

945 if "default" in v: 

946 assert isinstance(v["default"], str) 

947 default_then = v["default"] 

948 

949 # Warning message about missing conditions for debugging. 

950 

951 if cond == "default-true" and not default_then and not silent: 

952 wxr.wtp.debug( 

953 "inflection table: IF MISSING COND: word={} " 

954 "lang={} text={} base_tags={} c={} cond={}".format( 

955 word, lang, text, base_tags, c, cond 

956 ), 

957 sortid="inflection/851", 

958 ) 

959 # Based on the result of evaluating the condition, select either 

960 # "then" part or "else" part. 

961 if cond: 

962 v = v.get("then", "") 

963 else: 

964 v1 = v.get("else") 

965 if v1 is None: 

966 if default_then: 

967 v = default_then 

968 else: 

969 if not silent: 

970 wxr.wtp.debug( 

971 "inflection table: IF WITHOUT ELSE EVALS " 

972 "False: " 

973 "{}/{} {!r} base_tags={}".format( 

974 word, lang, text, base_tags 

975 ), 

976 sortid="inflection/865", 

977 ) 

978 v = "error-unrecognized-form" 

979 else: 

980 v = v1 

981 

982 # Merge the resulting tagset from this header part with the other 

983 # tagsets from the whole header 

984 combined_return = or_tagsets(lang, pos, combined_return, tagset) 

985 

986 # Return the combined tagsets, or empty tagset if we got no tagsets 

987 if not combined_return: 

988 combined_return = [()] 

989 return combined_return 

990 

991 

def compute_coltags(
    lang: str,
    pos: str,
    hdrspans: list["HdrSpan"],
    start: int,
    colspan: int,
    celltext: str,
) -> list[tuple[str, ...]]:
    """Computes column tags for a column of the given width based on the
    current header spans.

    ``lang``/``pos`` identify the table's language and part-of-speech
    (used for language-specific merge configuration).  ``hdrspans`` is
    the list of HdrSpan header objects collected so far; ``start`` and
    ``colspan`` give the horizontal range of the data cell; ``celltext``
    is the cell's text and is used only for debug printing against
    ``debug_cell_text``.  Returns a list of tag tuples (alternative
    tagsets) applying to the cell.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(hdrspans, list)
    assert isinstance(start, int) and start >= 0
    assert isinstance(colspan, int) and colspan >= 1
    assert isinstance(celltext, str)  # For debugging only
    # print("COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}"
    #       .format(start, colspan, celltext))
    # For debugging, set this to the form for whose cell you want debug prints
    if celltext == debug_cell_text:
        print(
            "COMPUTE_COLTAGS CALLED start={} colspan={} celltext={!r}".format(
                start, colspan, celltext
            )
        )
        for hdrspan in hdrspans:
            print(
                "  row={} start={} colspans={} tagsets={}".format(
                    hdrspan.rownum,
                    hdrspan.start,
                    hdrspan.colspan,
                    hdrspan.tagsets,
                )
            )
    used = set()  # (start, colspan) positions already consumed
    coltags = [()]  # accumulated result across header rows
    last_header_row = 1000000  # rownum of the last accepted header row
    # Iterate through the headers in reverse order, i.e., headers lower in the
    # table (closer to the cell) first.
    row_tagsets = [()]  # tagsets accumulated within the current header row
    row_tagsets_rownum = 1000000
    used_hdrspans = set()  # id()s of HdrSpans already merged in
    for hdrspan in reversed(hdrspans):
        if (
            hdrspan.start + hdrspan.colspan <= start
            or hdrspan.start >= start + colspan
        ):
            # Does not horizontally overlap current cell. Ignore this hdrspan.
            if celltext == debug_cell_text:
                print(
                    "Ignoring row={} start={} colspan={} tagsets={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.tagsets,
                    )
                )
            continue
        # If the cell partially overlaps the current cell, assume we have
        # reached something unrelated and abort.
        if (
            hdrspan.start < start
            and hdrspan.start + hdrspan.colspan > start
            and hdrspan.start + hdrspan.colspan < start + colspan
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at start {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Same partial-overlap abort for the right edge, unless this header
        # was artificially widened ("expanded") to cover part of the row.
        if (
            hdrspan.start < start + colspan
            and hdrspan.start > start
            and hdrspan.start + hdrspan.colspan > start + colspan
            and not hdrspan.expanded
        ):
            if celltext == debug_cell_text:
                print(
                    "break on partial overlap at end {} {} {}".format(
                        hdrspan.start, hdrspan.colspan, hdrspan.tagsets
                    )
                )
            break
        # Check if we have already used this cell.
        if id(hdrspan) in used_hdrspans:
            continue
        # We are going to use this cell.
        used_hdrspans.add(id(hdrspan))
        tagsets = hdrspan.tagsets
        # If the hdrspan is fully inside the current cell and does not cover
        # it fully, check if we should merge information from multiple cells.
        if not hdrspan.expanded and (
            hdrspan.start > start
            or hdrspan.start + hdrspan.colspan < start + colspan
        ):
            # Multiple columns apply to the current cell, only
            # gender/number/case tags present
            # If there are no tags outside the range in any of the
            # categories included in these cells, don't add anything
            # (assume all choices valid in the language are possible).
            in_cats = set(
                valid_tags[t]
                for x in hdrspans
                if x.rownum == hdrspan.rownum
                and x.start >= start
                and x.start + x.colspan <= start + colspan
                for tt in x.tagsets
                for t in tt
            )
            if celltext == debug_cell_text:
                print("in_cats={} tagsets={}".format(in_cats, tagsets))
            # Merge the tagsets into existing tagsets.  This merges
            # alternatives into the same tagset if there is only one
            # category different; otherwise this splits the tagset into
            # more alternatives.
            includes_all_on_row = True
            for x in hdrspans:
                # print("X: x.rownum={} x.start={}".format(x.rownum, x.start))
                if x.rownum != hdrspan.rownum:
                    continue
                if x.start < start or x.start + x.colspan > start + colspan:
                    if celltext == debug_cell_text:
                        print(
                            "NOT IN RANGE: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    includes_all_on_row = False
                    continue
                if id(x) in used_hdrspans:
                    if celltext == debug_cell_text:
                        print(
                            "ALREADY USED: {} {} {}".format(
                                x.start, x.colspan, x.tagsets
                            )
                        )
                    continue
                used_hdrspans.add(id(x))
                if celltext == debug_cell_text:
                    print(
                        "Merging into wide col: x.rownum={} "
                        "x.start={} x.colspan={} "
                        "start={} colspan={} tagsets={} x.tagsets={}".format(
                            x.rownum,
                            x.start,
                            x.colspan,
                            start,
                            colspan,
                            tagsets,
                            x.tagsets,
                        )
                    )
                tagsets = or_tagsets(lang, pos, tagsets, x.tagsets)
            # If all headers on the row were included, ignore them.
            # See e.g. kunna/Swedish/Verb.
            ts_cats = tagset_cats(tagsets)
            if (
                includes_all_on_row
                or
                # Kludge, see fut/Hungarian/Verb
                ("tense" in ts_cats and "object" in ts_cats)
            ):
                tagsets = [()]
            # For limited categories, if the category doesn't appear
            # outside, we won't include the category
            if not in_cats - set(
                ("gender", "number", "person", "case", "category", "voice")
            ):
                # Sometimes we have masc, fem, neut and plural, so treat
                # number and gender as the same here (if one given, look for
                # the other too)
                if "number" in in_cats or "gender" in in_cats:
                    in_cats.update(("number", "gender"))
                # Determine which categories occur outside on
                # the same row.  Ignore headers that have been expanded
                # to cover the whole row/part of it.
                out_cats = set(
                    valid_tags[t]
                    for x in hdrspans
                    if x.rownum == hdrspan.rownum
                    and not x.expanded
                    and (
                        x.start < start or x.start + x.colspan > start + colspan
                    )
                    for tt in x.tagsets
                    for t in tt
                )
                if celltext == debug_cell_text:
                    print("in_cats={} out_cats={}".format(in_cats, out_cats))
                # Remove all inside categories that do not appear outside

                new_tagsets = []
                for ts in tagsets:
                    tags = tuple(
                        sorted(t for t in ts if valid_tags[t] in out_cats)
                    )
                    if tags not in new_tagsets:
                        new_tagsets.append(tags)
                if celltext == debug_cell_text and new_tagsets != tagsets:
                    print(
                        "Removed tags that do not "
                        "appear outside {} -> {}".format(
                            # have_hdr never used?
                            tagsets,
                            new_tagsets,
                        )
                    )
                tagsets = new_tagsets
        key = (hdrspan.start, hdrspan.colspan)
        if key in used:
            if celltext == debug_cell_text:
                print(
                    "Cellspan already used: start={} "
                    "colspan={} rownum={} {}".format(
                        hdrspan.start,
                        hdrspan.colspan,
                        hdrspan.rownum,
                        hdrspan.tagsets,
                    )
                )
            action = get_lang_conf(lang, "reuse_cellspan")
            # can be "stop", "skip" or "reuse"
            if action == "stop":
                break
            if action == "skip":
                continue
            assert action == "reuse"
        tcats = tagset_cats(tagsets)
        # Most headers block using the same column position above.  However,
        # "register" tags don't do this (cf. essere/Italian/verb: "formal")
        if len(tcats) != 1 or "register" not in tcats:
            used.add(key)
        # If we have moved to a different row, merge into column tagsets
        # (we use different and_tagsets within the row)
        if row_tagsets_rownum != hdrspan.rownum:
            # row_tagsets_rownum was initialized as 10000000
            ret = and_tagsets(lang, pos, coltags, row_tagsets)
            if celltext == debug_cell_text:
                print(
                    "merging rows: {} {} -> {}".format(
                        coltags, row_tagsets, ret
                    )
                )
            coltags = ret
            row_tagsets = [()]
            row_tagsets_rownum = hdrspan.rownum
        # Merge into coltags
        if hdrspan.all_headers_row and hdrspan.rownum + 1 == last_header_row:
            # If this row is all headers and immediately preceeds the last
            # header we accepted, take any header from there.
            row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
            if celltext == debug_cell_text:
                print("merged (next header row): {}".format(row_tagsets))
        else:
            # new_cats is for the new tags (higher up in the table)
            new_cats = tagset_cats(tagsets)
            # cur_cats is for the tags already collected (lower in the table)
            cur_cats = tagset_cats(coltags)
            if celltext == debug_cell_text:
                print(
                    "row={} start={} colspan={} tagsets={} coltags={} "
                    "new_cats={} cur_cats={}".format(
                        hdrspan.rownum,
                        hdrspan.start,
                        hdrspan.colspan,
                        tagsets,
                        coltags,
                        new_cats,
                        cur_cats,
                    )
                )
            if "detail" in new_cats:
                if not any(coltags):  # Only if no tags so far
                    coltags = or_tagsets(lang, pos, coltags, tagsets)
                if celltext == debug_cell_text:
                    print("stopping on detail after merge")
                break
            # Here, we block bleeding of categories from above
            elif "non-finite" in cur_cats and "non-finite" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_non_finite")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-non-finite")
                    break
            elif "non-finite" in cur_cats and "voice" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_voice")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite-voice")
                    break
            elif "non-finite" in new_cats and cur_cats & set(
                ("person", "number")
            ):
                if celltext == debug_cell_text:
                    print("stopping on non-finite new")
                break
            elif "non-finite" in new_cats and "tense" in new_cats:
                stop = get_lang_conf(lang, "stop_non_finite_tense")
                if stop:
                    if celltext == debug_cell_text:
                        print("stopping on non-finite new")
                    break
            elif "non-finite" in cur_cats and new_cats & set(("mood",)):
                if celltext == debug_cell_text:
                    print("stopping on non-finite cur")
                break
            if (
                "tense" in new_cats
                and any("imperative" in x for x in coltags)
                and get_lang_conf(lang, "imperative_no_tense")
            ):
                if celltext == debug_cell_text:
                    print("skipping tense in imperative")
                continue
            elif (
                "mood" in new_cats
                and "mood" in cur_cats
                and
                # Allow if all new tags are already in current set
                any(
                    t not in ts1
                    for ts1 in coltags  # current
                    for ts2 in tagsets  # new (from above)
                    for t in ts2
                )
            ):
                skip = get_lang_conf(lang, "skip_mood_mood")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on mood-mood")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on mood-mood")
                    break
            elif "tense" in new_cats and "tense" in cur_cats:
                skip = get_lang_conf(lang, "skip_tense_tense")
                if skip:
                    if celltext == debug_cell_text:
                        print("skipping on tense-tense")
                    # we continue to next header
                else:
                    if celltext == debug_cell_text:
                        print("stopping on tense-tense")
                    break
            elif "aspect" in new_cats and "aspect" in cur_cats:
                if celltext == debug_cell_text:
                    print("skipping on aspect-aspect")
                continue
            elif "number" in cur_cats and "number" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-number")
                break
            elif "number" in cur_cats and "gender" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on number-gender")
                break
            elif "person" in cur_cats and "person" in new_cats:
                if celltext == debug_cell_text:
                    print("stopping on person-person")
                break
            else:
                # Merge tags and continue to next header up/left in the table.
                row_tagsets = and_tagsets(lang, pos, row_tagsets, tagsets)
                if celltext == debug_cell_text:
                    print("merged: {}".format(coltags))
        # Update the row number from which we have last taken headers
        last_header_row = hdrspan.rownum
    # Merge the final row tagset into coltags
    coltags = and_tagsets(lang, pos, coltags, row_tagsets)
    # print(
    #     "HDRSPANS:", list((x.start, x.colspan, x.tagsets) for x in hdrspans)
    # )
    if celltext == debug_cell_text:
        print("COMPUTE_COLTAGS {} {}: {}".format(start, colspan, coltags))
    assert isinstance(coltags, list)
    assert all(isinstance(x, tuple) for x in coltags)
    return coltags

1372 

1373 

1374def parse_simple_table( 

1375 wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth 

1376): 

1377 """This is the default table parser. Despite its name, it can parse 

1378 complex tables. This returns a list of forms to be added to the 

1379 part-of-speech, or None if the table could not be parsed.""" 

1380 assert isinstance(wxr, WiktextractContext) 

1381 assert isinstance(tablecontext, TableContext) 

1382 assert isinstance(word, str) 

1383 assert isinstance(lang, str) 

1384 assert isinstance(pos, str) 

1385 assert isinstance(rows, list) 

1386 assert isinstance(source, str) 

1387 assert isinstance(after, str) 

1388 assert isinstance(depth, int) 

1389 for row in rows: 

1390 for col in row: 

1391 assert isinstance(col, InflCell) 

1392 assert isinstance(titles, list) 

1393 for x in titles: 

1394 assert isinstance(x, str) 

1395 

1396 # print("PARSE_SIMPLE_TABLE: TITLES:", titles) 

1397 if debug_cell_text: 1397 ↛ 1398line 1397 didn't jump to line 1398 because the condition on line 1397 was never true

1398 print("ROWS:") 

1399 for row in rows: 

1400 print(" ", row) 

1401 

1402 # Check for forced rowspan kludge. See e.g. 

1403 # maorski/Serbo-Croatian. These are essentially multi-row 

1404 # cells implemented using <br> rather than separate cell. We fix this 

1405 # by identifying rows where this happens, and splitting the current row 

1406 # to multiple rows by synthesizing additional cells. 

1407 new_rows = [] 

1408 for row in rows: 

1409 split_row = ( 

1410 any(x.is_title and x.text in ("inanimate\nanimate",) for x in row) 

1411 and 

1412 # x is an InflCell 

1413 all(x.rowspan == 1 for x in row) 

1414 ) 

1415 if not split_row: 

1416 new_rows.append(row) 

1417 continue 

1418 row1 = [] 

1419 row2 = [] 

1420 for cell in row: 

1421 cell1 = copy.deepcopy(cell) 

1422 if "\n" in cell.text: 

1423 # Has more than one line - split this cell 

1424 parts = cell.text.strip().splitlines() 

1425 if len(parts) != 2: 1425 ↛ 1426line 1425 didn't jump to line 1426 because the condition on line 1425 was never true

1426 wxr.wtp.debug( 

1427 "forced rowspan kludge got {} parts: {!r}".format( 

1428 len(parts), cell.text 

1429 ), 

1430 sortid="inflection/1234", 

1431 ) 

1432 cell2 = copy.deepcopy(cell) 

1433 cell1.text = parts[0] 

1434 cell2.text = parts[1] 

1435 else: 

1436 cell1.rowspan = 2 

1437 cell2 = cell1 # ref, not a copy 

1438 row1.append(cell1) 

1439 row2.append(cell2) 

1440 new_rows.append(row1) 

1441 new_rows.append(row2) 

1442 rows = new_rows 

1443 # print("ROWS AFTER FORCED ROWSPAN KLUDGE:") 

1444 # for row in rows: 

1445 # print(" ", row) 

1446 

1447 # Parse definitions for references (from table itself and from text 

1448 # after it) 

1449 def_ht = {} 

1450 

1451 def add_defs(defs: list[tuple[str, str]]) -> None: 

1452 for ref, d in defs: 

1453 # print("DEF: ref={} d={}".format(ref, d)) 

1454 d = d.strip() 

1455 d = d.split(". ")[0].strip() # text before ". " 

1456 if not d: 1456 ↛ 1457line 1456 didn't jump to line 1457 because the condition on line 1456 was never true

1457 continue 

1458 if d.endswith("."): # catc ".."?? 

1459 d = d[:-1] 

1460 tags, topics = decode_tags(d, no_unknown_starts=True) 

1461 # print(f"{ref=}, {d=}, {tags=}") 

1462 if topics or any("error-unknown-tag" in ts for ts in tags): 

1463 d = d[0].lower() + d[1:] 

1464 tags, topics = decode_tags(d, no_unknown_starts=True) 

1465 if topics or any("error-unknown-tag" in ts for ts in tags): 

1466 # Failed to parse as tags 

1467 # print("Failed: topics={} tags={}" 

1468 # .format(topics, tags)) 

1469 continue 

1470 tags1_s: set[str] = set() 

1471 for ts in tags: 

1472 tags1_s.update(ts) 

1473 tags1 = tuple(sorted(tags1_s)) 

1474 # print("DEFINED: {} -> {}".format(ref, tags1)) 

1475 def_ht[ref] = tags1 

1476 

    def generate_tags(
        rowtags: list[tuple[str, ...]], table_tags: list[str]
    ) -> tuple[
        list[tuple[str, ...]], list[tuple[str, ...]], list[tuple[str, ...]]
    ]:
        """Combine row, column, global and table tags for a header cell.

        For each existing row tagset, compute the column tagsets for the
        current cell position, expand the header text against that
        combined context, and collect the results.  Returns
        (new_rowtags, new_coltags, all_hdr_tags).  Note that
        ``table_tags`` may be extended in place when forced word-level
        tags are encountered.  Uses enclosing-scope variables (lang,
        pos, hdrspans, col_idx, colspan, col, global_tags, text,
        refs_tags, hdr_tags, wxr, tablecontext, word, depth).
        """
        new_coltags = []
        all_hdr_tags = []  # list of tuples
        new_rowtags = []
        for rt0 in rowtags:
            for ct0 in compute_coltags(
                lang,
                pos,
                hdrspans,
                col_idx,  # col_idx=>start
                colspan,
                col,  # cell_text
            ):
                # Union of row, column, global and table-level tags forms
                # the context for evaluating this header.
                base_tags: set[str] = (
                    set(rt0)
                    | set(ct0)
                    | set(global_tags)
                    | set(itertools.chain.from_iterable(table_tags))
                )  # Union.
                alt_tags = expand_header(
                    wxr,
                    tablecontext,
                    word,
                    lang,
                    pos,
                    text,
                    base_tags,
                    depth=depth,
                )
                # base_tags are used in infl_map "if"-conds.
                for tt in alt_tags:
                    if tt not in all_hdr_tags:
                        all_hdr_tags.append(tt)
                    tt_s = set(tt)
                    # Certain tags are always moved to word-level tags
                    if tt_s & TAGS_FORCED_WORDTAGS:
                        table_tags.extend(tt_s & TAGS_FORCED_WORDTAGS)
                        tt_s = tt_s - TAGS_FORCED_WORDTAGS
                    # Add tags from referenced footnotes
                    tt_s.update(refs_tags)
                    # Sort, convert to tuple, and add to set of
                    # alternatives.
                    tt = tuple(sorted(tt_s))
                    if tt not in new_coltags:
                        new_coltags.append(tt)
                    # Kludge (saprast/Latvian/Verb): ignore row tags
                    # if trying to add a non-finite after mood.
                    if any(valid_tags[t] == "mood" for t in rt0) and any(
                        valid_tags[t] == "non-finite" for t in tt
                    ):
                        tags = tuple(sorted(set(tt) | set(hdr_tags)))
                    else:
                        tags = tuple(sorted(set(tt) | set(rt0) | set(hdr_tags)))
                    if tags not in new_rowtags:
                        new_rowtags.append(tags)
        return new_rowtags, new_coltags, all_hdr_tags

1537 

    def add_new_hdrspan(
        col: str,
        hdrspans: list[HdrSpan],
        store_new_hdrspan: bool,
        col0_followed_by_nonempty: bool,
        col0_hdrspan: Optional[HdrSpan],
    ) -> tuple[str, bool, Optional[HdrSpan]]:
        """Create a HdrSpan for the current header cell and update the
        left-side (col0) header-expansion state.

        Appends the new hdrspan to ``hdrspans`` and, when applicable,
        expands the current leftmost header over the columns seen since
        it started.  Returns the (possibly unchanged) cell text, the
        updated col0_followed_by_nonempty flag, and the new/retained
        col0_hdrspan.  Uses enclosing-scope variables (col_idx, colspan,
        rowspan, rownum, new_coltags, all_headers, previously_seen,
        all_hdr_tags, tablecontext, lang).
        """
        hdrspan = HdrSpan(
            col_idx, colspan, rowspan, rownum, new_coltags, col, all_headers
        )
        hdrspans.append(hdrspan)

        # infl-map tag "dummy-store-hdrspan" causes this new hdrspan
        # to be added to a register of stored hdrspans to be used
        # later with "dummy-load-stored-hdrspans".
        if store_new_hdrspan:
            tablecontext.stored_hdrspans.append(hdrspan)

        # Handle headers that are above left-side header
        # columns and are followed by personal pronouns in
        # remaining columns (basically headers that
        # evaluate to no tags). In such cases widen the
        # left-side header to the full row.
        if previously_seen:  # id(cell) in seen_cells previously
            col0_followed_by_nonempty = True
            return col, col0_followed_by_nonempty, col0_hdrspan
        elif col0_hdrspan is None:
            # No left-side header yet; this becomes the candidate.
            col0_hdrspan = hdrspan
        elif any(all_hdr_tags):
            col0_cats = tagset_cats(col0_hdrspan.tagsets)
            later_cats = tagset_cats(all_hdr_tags)
            col0_allowed = get_lang_conf(lang, "hdr_expand_first")
            later_allowed = get_lang_conf(lang, "hdr_expand_cont")
            later_allowed = later_allowed | set(["dummy"])
            # dummy2 has different behavior than plain dummy
            # and does not belong here.

            # print("col0_cats={} later_cats={} "
            #       "fol_by_nonempty={} col_idx={} end={} "
            #       "tagsets={}"
            #       .format(col0_cats, later_cats,
            #               col0_followed_by_nonempty, col_idx,
            #               col0_hdrspan.start +
            #               col0_hdrspan.colspan,
            #               col0_hdrspan.tagsets))
            # print("col0.rowspan={} rowspan={}"
            #       .format(col0_hdrspan.rowspan, rowspan))
            # Only expand if [col0_cats and later_cats are allowed
            # and don't overlap] and [col0 has tags], and there have
            # been [no disallowed cells in between].
            #
            # There are three cases here:
            #   - col0_hdrspan set, continue with allowed current
            #   - col0_hdrspan set, expand, start new
            #   - col0_hdrspan set, no expand, start new
            if (
                not col0_followed_by_nonempty
                and
                # XXX Only one cat of tags: kunna/Swedish
                # XXX len(col0_cats) == 1 and
                col0_hdrspan.rowspan >= rowspan
                and
                # from hdrspan
                not (later_cats - later_allowed)
                and not (col0_cats & later_cats)
            ):
                # First case: col0 set, continue
                return col, col0_followed_by_nonempty, col0_hdrspan
            # We are going to start new col0_hdrspan.  Check if
            # we should expand.
            if (
                not col0_followed_by_nonempty
                and not (col0_cats - col0_allowed)
                and
                # Only "allowed" allowed
                # XXX len(col0_cats) == 1 and
                col_idx > col0_hdrspan.start + col0_hdrspan.colspan
            ):
                # col_idx is beyond current colspan
                # *Expand* current col0_hdrspan
                # print("EXPANDING COL0 MID: {} from {} to {} "
                #       "cols {}"
                #       .format(col0_hdrspan.text,
                #               col0_hdrspan.colspan,
                #               col_idx - col0_hdrspan.start,
                #               col0_hdrspan.tagsets))
                col0_hdrspan.colspan = col_idx - col0_hdrspan.start
                col0_hdrspan.expanded = True
            # Clear old col0_hdrspan
            if col == debug_cell_text:
                print("START NEW {}".format(hdrspan.tagsets))
            col0_hdrspan = None
            # Now start new, unless it comes from previous row
            if not previously_seen:
                col0_hdrspan = hdrspan
                col0_followed_by_nonempty = False
        return col, col0_followed_by_nonempty, col0_hdrspan

1635 

1636 def split_text_into_alts(col: str) -> tuple[str, list[str], list[str]]: 

1637 # Split the cell text into alternatives 

1638 split_extra_tags = [] 

1639 if col and is_superscript(col[0]): 1639 ↛ 1640line 1639 didn't jump to line 1640 because the condition on line 1639 was never true

1640 alts = [col] 

1641 else: 

1642 separators = [";", "•", r"\n", " or "] 

1643 if " + " not in col: 

1644 separators.append(",") 

1645 if not col.endswith("/"): 

1646 separators.append("/") 

1647 if col in special_phrase_splits: 

1648 # Use language-specific special splits. 

1649 # These are phrases and constructions that have 

1650 # unique ways of splitting, not specific characters 

1651 # to split on like with the default splitting. 

1652 alts, tags = special_phrase_splits[col] 

1653 split_extra_tags = tags.split() 

1654 for x in split_extra_tags: 

1655 assert x in valid_tags 

1656 assert isinstance(alts, (list, tuple)) 

1657 assert isinstance(tags, str) 

1658 else: 

1659 # Use default splitting. However, recognize 

1660 # language-specific replacements and change them to magic 

1661 # characters before splitting. This way we won't split 

1662 # them. This is important for, e.g., recognizing 

1663 # alternative pronouns. 

1664 # The magic characters are characters out of Unicode scope 

1665 # that are given a simple incremental value, int > unicode. 

1666 repls = {} 

1667 magic_ch = MAGIC_FIRST 

1668 trs = get_lang_conf(lang, "form_transformations") 

1669 # trs is a list of lists of strings 

1670 for _, v, _, _ in trs: 

1671 # v is a pattern string, like "^ich" 

1672 # form_transformations data is doing double-duty here, 

1673 # because the pattern strings are already known to us and 

1674 # not meant to be split. 

1675 m = re.search(v, col) 

1676 if m is not None: 

1677 # if pattern found in text 

1678 magic = chr(magic_ch) 

1679 magic_ch += 1 # next magic character value 

1680 col = re.sub(v, magic, col) # replace with magic ch 

1681 repls[magic] = m.group(0) 

1682 # remember what regex match string each magic char 

1683 # replaces. .group(0) is the whole match. 

1684 alts0 = split_at_comma_semi(col, separators=separators) 

1685 # with magic characters in place, split the text so that 

1686 # pre-transformation text is out of the way. 

1687 alts = [] 

1688 for alt in alts0: 

1689 # create a new list with the separated items and 

1690 # the magic characters replaced with the original texts. 

1691 for k, v in repls.items(): 

1692 alt = re.sub(k, v, alt) 

1693 alts.append(alt) 

1694 # Remove "*" from beginning of forms, as in non-attested 

1695 # or reconstructed forms. Otherwise it might confuse romanization 

1696 # detection. 

1697 alts = list(re.sub(r"^\*\*?([^ ])", r"\1", x) for x in alts) 

1698 alts = list( 

1699 x for x in alts if not re.match(r"pronounced with |\(with ", x) 

1700 ) 

1701 alts = list( 

1702 re.sub(r"^\((in the sense [^)]*)\)\s+", "", x) for x in alts 

1703 ) 

1704 # Check for parenthesized alternatives, e.g. ripromettersi/Italian 

1705 if all( 1705 ↛ 1716line 1705 didn't jump to line 1716 because the condition on line 1705 was never true

1706 re.match(r"\w+( \w+)* \(\w+( \w+)*(, \w+( \w+)*)*\)$", alt) 

1707 # word word* \(word word*(, word word*)*\) 

1708 and all( 

1709 distw([re.sub(r" \(.*", "", alt)], x) < 0.5 

1710 # Levenshtein distance 

1711 for x in re.sub(r".*\((.*)\)", r"\1", alt).split(", ") 

1712 ) 

1713 # Extract from parentheses for testin 

1714 for alt in alts 

1715 ): 

1716 new_alts = [] 

1717 for alt in alts: 

1718 # Replace parentheses before splitting 

1719 alt = alt.replace(" (", ", ") 

1720 alt = alt.replace(")", "") 

1721 for new_alt in alt.split(", "): 

1722 new_alts.append(new_alt) 

1723 alts = new_alts 

1724 return col, alts, split_extra_tags 

1725 

1726 def handle_mixed_lines(alts: list[str]) -> list[tuple[str, str, str]]: 

1727 # Handle the special case where romanization is given under 

1728 # normal form, e.g. in Russian. There can be multiple 

1729 # comma-separated forms in each case. We also handle the case 

1730 # where instead of romanization we have IPA pronunciation 

1731 # (e.g., avoir/French/verb). 

1732 len2 = len(alts) // 2 

1733 # Check for IPAs (forms first, IPAs under) 

1734 # base, base, IPA, IPA 

1735 if ( 

1736 len(alts) % 2 == 0 # Divisibly by two 

1737 and all( 

1738 re.match(r"^\s*/.*/\s*$", x) # Inside slashes = IPA 

1739 for x in alts[len2:] 

1740 ) 

1741 ): # In the second half of alts 

1742 nalts = list( 

1743 (alts[i], "", alts[i + len2]) 

1744 # List of tuples: (base, "", ipa) 

1745 for i in range(len2) 

1746 ) 

1747 # base, base, base, IPA 

1748 elif ( 

1749 len(alts) > 2 

1750 and re.match(r"^\s*/.*/\s*$", alts[-1]) 

1751 and all(not x.startswith("/") for x in alts[:-1]) 

1752 ): 

1753 # Only if the last alt is IPA 

1754 nalts = list((alts[i], "", alts[-1]) for i in range(len(alts) - 1)) 

1755 # base, IPA, IPA, IPA 

1756 elif ( 

1757 len(alts) > 2 

1758 and not alts[0].startswith("/") 

1759 and all( 

1760 re.match(r"^\s*/.*/\s*$", alts[i]) for i in range(1, len(alts)) 

1761 ) 

1762 ): 

1763 # First is base and the rest is IPA alternatives 

1764 nalts = list((alts[0], "", alts[i]) for i in range(1, len(alts))) 

1765 

1766 # Check for romanizations, forms first, romanizations under 

1767 elif ( 

1768 len(alts) % 2 == 0 

1769 and not any("(" in x for x in alts) 

1770 and all( 

1771 classify_desc( 

1772 re.sub( 

1773 r"\^.*$", 

1774 "", 

1775 # Remove ends of strings starting from ^. 

1776 # Supescripts have been already removed 

1777 # from the string, while ^xyz needs to be 

1778 # removed separately, though it's usually 

1779 # something with a single letter? 

1780 "".join(xx for xx in x if not is_superscript(xx)), 

1781 ) 

1782 ) 

1783 == "other" 

1784 for x in alts[:len2] 

1785 ) 

1786 and all( 

1787 classify_desc( 

1788 re.sub( 

1789 r"\^.*$", 

1790 "", 

1791 "".join(xx for xx in x if not is_superscript(xx)), 

1792 ) 

1793 ) 

1794 in ("romanization", "english") 

1795 for x in alts[len2:] 

1796 ) 

1797 ): 

1798 nalts = list((alts[i], alts[i + len2], "") for i in range(len2)) 

1799 # Check for romanizations, forms and romanizations alternating 

1800 elif ( 

1801 len(alts) % 2 == 0 

1802 and not any("(" in x for x in alts) 

1803 and all( 

1804 classify_desc( 

1805 re.sub( 

1806 r"\^.*$", 

1807 "", 

1808 "".join(xx for xx in alts[i] if not is_superscript(xx)), 

1809 ) 

1810 ) 

1811 == "other" 

1812 for i in range(0, len(alts), 2) 

1813 ) 

1814 and all( 

1815 classify_desc( 

1816 re.sub( 

1817 r"\^.*$", 

1818 "", 

1819 "".join(xx for xx in alts[i] if not is_superscript(xx)), 

1820 ) 

1821 ) 

1822 in ("romanization", "english") 

1823 for i in range(1, len(alts), 2) 

1824 ) 

1825 ): 

1826 # odds 

1827 nalts = list( 

1828 (alts[i], alts[i + 1], "") for i in range(0, len(alts), 2) 

1829 ) 

1830 # evens 

1831 else: 

1832 new_alts = [] 

1833 for alt in alts: 

1834 lst = [""] 

1835 idx = 0 

1836 for m in re.finditer( 

1837 r"(^|\w|\*)\((\w+" r"(/\w+)*)\)", 

1838 # start OR letter OR asterisk (word/word*) 

1839 # \\___________group 1_______/ \ \_g3_/// 

1840 # \ \__gr. 2_// 

1841 # \_____________group 0________________/ 

1842 alt, 

1843 ): 

1844 v = m.group(2) # (word/word/word...) 

1845 if ( 

1846 classify_desc(v) == "tags" # Tags inside parens 

1847 or m.group(0) == alt 

1848 ): # All in parens 

1849 continue 

1850 new_lst = [] 

1851 for x in lst: 

1852 x += alt[idx : m.start()] + m.group(1) 

1853 # alt until letter or asterisk 

1854 idx = m.end() 

1855 vparts = v.split("/") 

1856 # group(2) = ["word", "wörd"...] 

1857 if len(vparts) == 1: 

1858 new_lst.append(x) 

1859 new_lst.append(x + v) 

1860 # "kind(er)" -> ["kind", "kinder"] 

1861 else: 

1862 for vv in vparts: 

1863 new_lst.append(x + vv) 

1864 # "lampai(tten/den)" -> 

1865 # ["lampaitten", "lampaiden"] 

1866 lst = new_lst 

1867 for x in lst: 

1868 new_alts.append(x + alt[idx:]) 

1869 # add the end of alt 

1870 nalts = list((x, "", "") for x in new_alts) 

1871 # [form, no romz, no ipa] 

1872 return nalts 

1873 

1874 def find_semantic_parens(form: str) -> tuple[str, list[str]]: 

1875 # "Some languages" (=Greek) use brackets to mark things that 

1876 # require tags, like (informality), [rarity] and {archaicity}. 

1877 extra_tags = [] 

1878 if re.match(r"\([^][(){}]*\)$", form): 

1879 if get_lang_conf(lang, "parentheses_for_informal"): 

1880 form = form[1:-1] 

1881 extra_tags.append("informal") 

1882 else: 

1883 form = form[1:-1] 

1884 elif re.match(r"\{\[[^][(){}]*\]\}$", form): 

1885 if get_lang_conf( 1885 ↛ 1892line 1885 didn't jump to line 1892 because the condition on line 1885 was always true

1886 lang, "square_brackets_for_rare" 

1887 ) and get_lang_conf(lang, "curly_brackets_for_archaic"): 

1888 # είμαι/Greek/Verb 

1889 form = form[2:-2] 

1890 extra_tags.extend(["rare", "archaic"]) 

1891 else: 

1892 form = form[2:-2] 

1893 elif re.match(r"\{[^][(){}]*\}$", form): 

1894 if get_lang_conf(lang, "curly_brackets_for_archaic"): 1894 ↛ 1899line 1894 didn't jump to line 1899 because the condition on line 1894 was always true

1895 # είμαι/Greek/Verb 

1896 form = form[1:-1] 

1897 extra_tags.extend(["archaic"]) 

1898 else: 

1899 form = form[1:-1] 

1900 elif re.match(r"\[[^][(){}]*\]$", form): 

1901 if get_lang_conf(lang, "square_brackets_for_rare"): 1901 ↛ 1906line 1901 didn't jump to line 1906 because the condition on line 1901 was always true

1902 # είμαι/Greek/Verb 

1903 form = form[1:-1] 

1904 extra_tags.append("rare") 

1905 else: 

1906 form = form[1:-1] 

1907 return form, extra_tags 

1908 

1909 def handle_parens( 

1910 form: str, roman: str, clitic: str, extra_tags: list[str] 

1911 ) -> tuple[str, str, str]: 

1912 if re.match(r"[’'][a-z]([a-z][a-z]?)?$", paren): 

1913 # is there a clitic starting with apostrophe? 

1914 clitic = paren 

1915 # assume the whole paren is a clitic 

1916 # then remove paren from form 

1917 form = (form[: m.start()] + subst + form[m.end() :]).strip() 

1918 elif classify_desc(paren) == "tags": 

1919 tagsets1, topics1 = decode_tags(paren) 

1920 if not topics1: 1920 ↛ 1941line 1920 didn't jump to line 1941 because the condition on line 1920 was always true

1921 for ts in tagsets1: 

1922 ts = tuple(x for x in ts if " " not in x) 

1923 # There are some generated tags containing 

1924 # spaces; do not let them through here. 

1925 extra_tags.extend(ts) 

1926 form = (form[: m.start()] + subst + form[m.end() :]).strip() 

1927 # brackets contain romanization 

1928 elif ( 1928 ↛ 1937line 1928 didn't jump to line 1937 because the condition on line 1928 was never true

1929 m.start() > 0 

1930 and not roman 

1931 and classify_desc(form[: m.start()]) == "other" 

1932 and 

1933 # "other" ~ text 

1934 classify_desc(paren) in ("romanization", "english") 

1935 and not re.search(r"^with |-form$", paren) 

1936 ): 

1937 roman = paren 

1938 form = (form[: m.start()] + subst + form[m.end() :]).strip() 

1939 elif re.search(r"^with |-form", paren): 1939 ↛ 1940line 1939 didn't jump to line 1940 because the condition on line 1939 was never true

1940 form = (form[: m.start()] + subst + form[m.end() :]).strip() 

1941 return form, roman, clitic 

1942 

1943 def merge_row_and_column_tags(form, some_has_covered_text): 

1944 # Merge column tags and row tags. We give preference 

1945 # to moods etc coming from rowtags (cf. austteigen/German/Verb 

1946 # imperative forms). 

1947 

1948 # In certain cases, what a tag means depends on whether 

1949 # it is a row or column header. Depending on the language, 

1950 # we replace certain tags with others if they're in 

1951 # a column or row 

1952 

1953 ret = [] 

1954 # rtagreplacs = get_lang_conf(lang, "rowtag_replacements") 

1955 # ctagreplacs = get_lang_conf(lang, "coltag_replacements") 

1956 for rt in sorted(rowtags): 

1957 if "dummy-use-as-coltags" in rt: 1957 ↛ 1958line 1957 didn't jump to line 1958 because the condition on line 1957 was never true

1958 continue 

1959 # if lang was in rowtag_replacements) 

1960 # if not rtagreplacs == None: 

1961 # rt = replace_directional_tags(rt, rtagreplacs) 

1962 for ct in sorted(coltags): 

1963 if "dummy-use-as-rowtags" in ct: 1963 ↛ 1964line 1963 didn't jump to line 1964 because the condition on line 1963 was never true

1964 continue 

1965 # if lang was in coltag_replacements 

1966 # if not ctagreplacs == None: 

1967 # ct = replace_directional_tags(ct, 

1968 # ctagreplacs) 

1969 tags = set(global_tags) 

1970 tags.update(extra_tags) 

1971 tags.update(rt) 

1972 tags.update(refs_tags) 

1973 tags.update(tablecontext.section_header) 

1974 # Merge tags from column. For certain kinds of tags, 

1975 # those coming from row take precedence. 

1976 old_tags = set(tags) 

1977 for t in ct: 

1978 c = valid_tags[t] 

1979 if c in ("mood", "case", "number") and any( 

1980 valid_tags[tt] == c for tt in old_tags 

1981 ): 

1982 continue 

1983 tags.add(t) 

1984 

1985 # Extract language-specific tags from the 

1986 # form. This may also adjust the form. 

1987 form, lang_tags = lang_specific_tags(lang, pos, form) 

1988 tags.update(lang_tags) 

1989 

1990 # For non-finite verb forms, see if they have 

1991 # a gender/class suffix 

1992 if pos == "verb" and any( 

1993 valid_tags[t] == "non-finite" for t in tags 

1994 ): 

1995 form, tt = parse_head_final_tags(wxr, lang, form) 

1996 tags.update(tt) 

1997 

1998 # Remove "personal" tag if have nth person; these 

1999 # come up with e.g. reconhecer/Portuguese/Verb. But 

2000 # not if we also have "pronoun" 

2001 if ( 

2002 "personal" in tags 

2003 and "pronoun" not in tags 

2004 and any( 

2005 x in tags 

2006 for x in [ 

2007 "first-person", 

2008 "second-person", 

2009 "third-person", 

2010 ] 

2011 ) 

2012 ): 

2013 tags.remove("personal") 

2014 

2015 # If we have impersonal, remove person and number. 

2016 # This happens with e.g. viajar/Portuguese/Verb 

2017 if "impersonal" in tags: 

2018 tags = tags - set( 

2019 [ 

2020 "first-person", 

2021 "second-person", 

2022 "third-person", 

2023 "singular", 

2024 "plural", 

2025 ] 

2026 ) 

2027 

2028 # Remove unnecessary "positive" tag from verb forms 

2029 if pos == "verb" and "positive" in tags: 

2030 if "negative" in tags: 2030 ↛ 2031line 2030 didn't jump to line 2031 because the condition on line 2030 was never true

2031 tags.remove("negative") 

2032 tags.remove("positive") 

2033 

2034 # Many Russian (and other Slavic) inflection tables 

2035 # have animate/inanimate distinction that generates 

2036 # separate entries for neuter/feminine, but the 

2037 # distinction only applies to masculine. Remove them 

2038 # form neuter/feminine and eliminate duplicates. 

2039 if get_lang_conf(lang, "masc_only_animate"): 

2040 for t1 in ("animate", "inanimate"): 

2041 for t2 in ("neuter", "feminine"): 

2042 if ( 

2043 t1 in tags 

2044 and t2 in tags 

2045 and "masculine" not in tags 

2046 and "plural" not in tags 

2047 ): 

2048 tags.remove(t1) 

2049 

2050 # German adjective tables contain "(keiner)" etc 

2051 # for mixed declension plural. When the adjective 

2052 # disappears and it becomes just one word, remove 

2053 # the "includes-article" tag. e.g. eiskalt/German 

2054 if "includes-article" in tags and " " not in form: 

2055 tags.remove("includes-article") 

2056 

2057 # Handle ignored forms. We mark that the form was 

2058 # provided. This is important information; some words 

2059 # just do not have a certain form. However, there also 

2060 # many cases where no word in a language has a 

2061 # particular form. Post-processing could detect and 

2062 # remove such cases. 

2063 if form in IGNORED_COLVALUES: 

2064 # if cell text seems to be ignorable 

2065 if "dummy-ignore-skipped" in tags: 

2066 continue 

2067 if ( 

2068 col_idx not in has_covering_hdr 

2069 and some_has_covered_text 

2070 ): 

2071 continue 

2072 # don't ignore this cell if there's been a header 

2073 # above it 

2074 form = "-" 

2075 elif col_idx in has_covering_hdr: 

2076 some_has_covered_text = True 

2077 

2078 # Handle ambiguous object concord. If a header 

2079 # gives the "dummy-object-concord"-tag to a word, 

2080 # replace person, number and gender tags with 

2081 # their "object-" counterparts so that the verb 

2082 # agrees with the object instead. 

2083 # Use only when the verb has ONLY object agreement! 

2084 # a پخول/Pashto 

2085 if "dummy-object-concord" in tags: 2085 ↛ 2086line 2085 didn't jump to line 2086 because the condition on line 2085 was never true

2086 for subtag, objtag in object_concord_replacements.items(): 

2087 if subtag in tags: 

2088 tags.remove(subtag) 

2089 tags.add(objtag) 

2090 

2091 # Remove the dummy mood tag that we sometimes 

2092 # use to block adding other mood and related 

2093 # tags 

2094 tags = tags - set( 

2095 [ 

2096 "dummy-mood", 

2097 "dummy-tense", 

2098 "dummy-ignore-skipped", 

2099 "dummy-object-concord", 

2100 "dummy-reset-headers", 

2101 "dummy-use-as-coltags", 

2102 "dummy-use-as-rowtags", 

2103 "dummy-store-hdrspan", 

2104 "dummy-load-stored-hdrspans", 

2105 "dummy-reset-stored-hdrspans", 

2106 "dummy-section-header", 

2107 ] 

2108 ) 

2109 

2110 # Perform language-specific tag replacements according 

2111 # to rules in a table. 

2112 lang_tag_mappings = get_lang_conf(lang, "lang_tag_mappings") 

2113 if lang_tag_mappings is not None: 2113 ↛ 2114line 2113 didn't jump to line 2114 because the condition on line 2113 was never true

2114 for pre, post in lang_tag_mappings.items(): 

2115 if all(t in tags for t in pre): 

2116 tags = (tags - set(pre)) | set(post) 

2117 

2118 # Warn if there are entries with empty tags 

2119 if not tags: 2119 ↛ 2120line 2119 didn't jump to line 2120 because the condition on line 2119 was never true

2120 wxr.wtp.debug( 

2121 "inflection table: empty tags for {}".format(form), 

2122 sortid="inflection/1826", 

2123 ) 

2124 

2125 # Warn if form looks like IPA 

2126 ########## XXX ######## 

2127 # Because IPA is its own unicode block, we could also 

2128 # technically do a Unicode name check to see if a string 

2129 # contains IPA. Not all valid IPA characters are in the 

2130 # IPA extension block, so you can technically have false 

2131 # negatives if it's something like /toki/, but it 

2132 # shouldn't give false positives. 

2133 # Alternatively, you could make a list of IPA-admissible 

2134 # characters and reject non-IPA stuff with that. 

2135 if re.match(r"\s*/.*/\s*$", form): 2135 ↛ 2136line 2135 didn't jump to line 2136 because the condition on line 2135 was never true

2136 wxr.wtp.debug( 

2137 "inflection table form looks like IPA: " 

2138 "form={} tags={}".format(form, tags), 

2139 sortid="inflection/1840", 

2140 ) 

2141 

2142 # Note that this checks `form`, not `in tags` 

2143 if form == "dummy-ignored-text-cell": 2143 ↛ 2144line 2143 didn't jump to line 2144 because the condition on line 2143 was never true

2144 continue 

2145 

2146 if "dummy-remove-this-cell" in tags: 2146 ↛ 2147line 2146 didn't jump to line 2147 because the condition on line 2146 was never true

2147 continue 

2148 

2149 # Add the form 

2150 tags = list(sorted(tags)) 

2151 dt = {"form": form, "tags": tags, "source": source} 

2152 if roman: 

2153 dt["roman"] = roman 

2154 if ipa: 

2155 dt["ipa"] = ipa 

2156 ret.append(dt) 

2157 # If we got separate clitic form, add it 

2158 if clitic: 

2159 dt = { 

2160 "form": clitic, 

2161 "tags": tags + ["clitic"], 

2162 "source": source, 

2163 } 

2164 ret.append(dt) 

2165 return ret, form, some_has_covered_text 

2166 

2167 # First extract definitions from cells 

2168 # See defs_ht for footnote defs stuff 

2169 for row in rows: 

2170 for cell in row: 

2171 text, refs, defs, hdr_tags = extract_cell_content( 

2172 lang, word, cell.text 

2173 ) 

2174 # refs, defs = footnote stuff, defs -> (ref, def) 

2175 add_defs(defs) 

2176 # Extract definitions from text after table 

2177 text, refs, defs, hdr_tags = extract_cell_content(lang, word, after) 

2178 add_defs(defs) 

2179 

2180 # Then extract the actual forms 

2181 ret = [] 

2182 hdrspans = [] 

2183 first_col_has_text = False 

2184 rownum = 0 

2185 title = None 

2186 global_tags = [] 

2187 table_tags = [] 

2188 special_phrase_splits = get_lang_conf(lang, "special_phrase_splits") 

2189 form_replacements = get_lang_conf(lang, "form_replacements") 

2190 possibly_ignored_forms = get_lang_conf(lang, "conditionally_ignored_cells") 

2191 cleanup_rules = get_lang_conf(lang, "minor_text_cleanups") 

2192 

2193 for title in titles: 

2194 more_global_tags, more_table_tags, extra_forms = parse_title( 

2195 title, source 

2196 ) 

2197 global_tags.extend(more_global_tags) 

2198 table_tags.extend(more_table_tags) 

2199 ret.extend(extra_forms) 

2200 cell_rowcnt = collections.defaultdict(int) 

2201 seen_cells = set() 

2202 has_covering_hdr = set() 

2203 some_has_covered_text = False 

2204 for row in rows: 

2205 # print("ROW:", row) 

2206 # print("====") 

2207 # print(f"Start of PREVIOUS row hdrspans:" 

2208 # f"{tuple(sp.tagsets for sp in hdrspans)}") 

2209 # print(f"Start of row txt: {tuple(t.text for t in row)}") 

2210 if not row: 2210 ↛ 2211line 2210 didn't jump to line 2211 because the condition on line 2210 was never true

2211 continue # Skip empty rows 

2212 all_headers = all(x.is_title or not x.text.strip() for x in row) 

2213 text = row[0].text 

2214 if ( 

2215 row[0].is_title 

2216 and text 

2217 and not is_superscript(text[0]) 

2218 and text not in infl_map # zealous inflation map? 

2219 and ( 

2220 re.match(r"Inflection ", text) 

2221 or re.sub( 

2222 r"\s+", 

2223 " ", # flatten whitespace 

2224 re.sub( 

2225 r"\s*\([^)]*\)", 

2226 "", 

2227 # Remove whitespace+parens 

2228 text, 

2229 ), 

2230 ).strip() 

2231 not in infl_map 

2232 ) 

2233 and not re.match(infl_start_re, text) 

2234 and all( 

2235 x.is_title == row[0].is_title and x.text == text 

2236 # all InflCells in `row` have the same is_title and text 

2237 for x in row 

2238 ) 

2239 ): 

2240 if text and title is None: 

2241 # Only if there were no titles previously make the first 

2242 # text that is found the title 

2243 title = text 

2244 if re.match(r"(Note:|Notes:)", title): 2244 ↛ 2245line 2244 didn't jump to line 2245 because the condition on line 2244 was never true

2245 continue # not a title 

2246 more_global_tags, more_table_tags, extra_forms = parse_title( 

2247 title, source 

2248 ) 

2249 global_tags.extend(more_global_tags) 

2250 table_tags.extend(more_table_tags) 

2251 ret.extend(extra_forms) 

2252 continue # Skip title rows without incrementing i 

2253 if "dummy-skip-this" in global_tags: 2253 ↛ 2254line 2253 didn't jump to line 2254 because the condition on line 2253 was never true

2254 return [] 

2255 rowtags = [()] 

2256 # have_hdr = False 

2257 # have_hdr never used? 

2258 have_text = False 

2259 samecell_cnt = 0 

2260 col0_hdrspan = None # col0 or later header (despite its name) 

2261 col0_followed_by_nonempty = False 

2262 row_empty = True 

2263 for col_idx, cell in enumerate(row): 

2264 colspan = cell.colspan # >= 1 

2265 rowspan = cell.rowspan # >= 1 

2266 previously_seen = id(cell) in seen_cells 

2267 # checks to see if this cell was in the previous ROW 

2268 seen_cells.add(id(cell)) 

2269 if samecell_cnt == 0: 

2270 # First column of a (possible multi-column) cell 

2271 samecell_cnt = colspan - 1 

2272 else: 

2273 assert samecell_cnt > 0 

2274 samecell_cnt -= 1 

2275 continue 

2276 

2277 # is_first_row_of_cell = cell_rowcnt[id(cell)] == 0 

2278 # never used? 

2279 

2280 # defaultdict(int) around line 1900 

2281 cell_rowcnt[id(cell)] += 1 

2282 # => how many cols this spans 

2283 col = cell.text 

2284 if not col: 

2285 continue 

2286 row_empty = False 

2287 is_title = cell.is_title 

2288 

2289 # If the cell has a target, i.e., text after colon, interpret 

2290 # it as simply specifying a value for that value and ignore 

2291 # it otherwise. 

2292 if cell.target: 

2293 text, refs, defs, hdr_tags = extract_cell_content( 

2294 lang, word, col 

2295 ) 

2296 if not text: 2296 ↛ 2297line 2296 didn't jump to line 2297 because the condition on line 2296 was never true

2297 continue 

2298 refs_tags = set() 

2299 for ref in refs: # gets tags from footnotes 2299 ↛ 2300line 2299 didn't jump to line 2300 because the loop on line 2299 never started

2300 if ref in def_ht: 

2301 refs_tags.update(def_ht[ref]) 

2302 rowtags = expand_header( 

2303 wxr, 

2304 tablecontext, 

2305 word, 

2306 lang, 

2307 pos, 

2308 text, 

2309 [], 

2310 silent=True, 

2311 depth=depth, 

2312 ) 

2313 rowtags = list( 

2314 set(tuple(sorted(set(x) | refs_tags)) for x in rowtags) 

2315 ) 

2316 is_title = False 

2317 col = cell.target 

2318 

2319 # print(rownum, col_idx, col) 

2320 # print(f"is_title: {is_title}") 

2321 if is_title: 

2322 # It is a header cell 

2323 text, refs, defs, hdr_tags = extract_cell_content( 

2324 lang, word, col 

2325 ) 

2326 if not text: 

2327 continue 

2328 # Extract tags from referenced footnotes 

2329 refs_tags = set() 

2330 for ref in refs: 

2331 if ref in def_ht: 

2332 refs_tags.update(def_ht[ref]) 

2333 

2334 # Expand header to tags 

2335 v = expand_header( 

2336 wxr, 

2337 tablecontext, 

2338 word, 

2339 lang, 

2340 pos, 

2341 text, 

2342 [], 

2343 silent=True, 

2344 depth=depth, 

2345 ) 

2346 # print("EXPANDED {!r} to {}".format(text, v)) 

2347 

2348 if col_idx == 0: 

2349 # first_col_has_text is used for a test to ignore 

2350 # upper-left cells that are just text without 

2351 # header info 

2352 first_col_has_text = True 

2353 # Check if the header expands to reset hdrspans 

2354 if any("dummy-reset-headers" in tt for tt in v): 

2355 new_hdrspans = [] 

2356 for hdrspan in hdrspans: 

2357 # if there are HdrSpan objects (abstract headers with 

2358 # row- and column-spans) that are to the left or at the 

2359 # same row or below, KEEP those; things above and to 

2360 # the right of the hdrspan with dummy-reset-headers 

2361 # are discarded. Tags from the header together with 

2362 # dummy-reset-headers are kept as normal. 

2363 if ( 

2364 hdrspan.start + hdrspan.colspan < col_idx 

2365 or hdrspan.rownum > rownum - cell.rowspan 

2366 ): 

2367 new_hdrspans.append(hdrspan) 

2368 hdrspans = new_hdrspans 

2369 

2370 for tt in v: 

2371 if "dummy-section-header" in tt: 2371 ↛ 2372line 2371 didn't jump to line 2372 because the condition on line 2371 was never true

2372 tablecontext.section_header = tt 

2373 break 

2374 if "dummy-reset-section-header" in tt: 2374 ↛ 2375line 2374 didn't jump to line 2375 because the condition on line 2374 was never true

2375 tablecontext.section_header = [] 

2376 # Text between headers on a row causes earlier headers to 

2377 # be reset 

2378 if have_text: 

2379 # print(" HAVE_TEXT BEFORE HDR:", col) 

2380 # Reset rowtags if new title column after previous 

2381 # text cells 

2382 # +-----+-----+-----+-----+ 

2383 # |hdr-a|txt-a|hdr-B|txt-B| 

2384 # +-----+-----+-----+-----+ 

2385 # ^reset rowtags=> 

2386 # XXX beware of header "—": "" - must not clear on that if 

2387 # it expands to no tags 

2388 rowtags = [()] 

2389 # have_hdr = True 

2390 # have_hdr never used? 

2391 # print("HAVE_HDR: {} rowtags={}".format(col, rowtags)) 

2392 # Update rowtags and coltags 

2393 has_covering_hdr.add(col_idx) # col_idx == current column 

2394 # has_covering_hdr is a set that has the col_idx-ids of columns 

2395 # that have previously had some kind of header. It is never 

2396 # resetted inside the col_idx-loops OR the bigger rows-loop, so 

2397 # applies to the whole table. 

2398 

2399 rowtags, new_coltags, all_hdr_tags = generate_tags( 

2400 rowtags, table_tags 

2401 ) 

2402 

2403 if any("dummy-skip-this" in ts for ts in rowtags): 

2404 continue # Skip this cell 

2405 

2406 if any("dummy-load-stored-hdrspans" in ts for ts in v): 2406 ↛ 2407line 2406 didn't jump to line 2407 because the condition on line 2406 was never true

2407 hdrspans.extend(tablecontext.stored_hdrspans) 

2408 

2409 if any("dummy-reset-stored-hdrspans" in ts for ts in v): 2409 ↛ 2410line 2409 didn't jump to line 2410 because the condition on line 2409 was never true

2410 tablecontext.stored_hdrspans = [] 

2411 

2412 if any("dummy-store-hdrspan" in ts for ts in v): 2412 ↛ 2414line 2412 didn't jump to line 2414 because the condition on line 2412 was never true

2413 # print(f"STORED: {col}") 

2414 store_new_hdrspan = True 

2415 else: 

2416 store_new_hdrspan = False 

2417 

2418 new_coltags = list( 

2419 x 

2420 for x in new_coltags 

2421 if not any(t in noinherit_tags for t in x) 

2422 ) 

2423 # print("new_coltags={} previously_seen={} all_hdr_tags={}" 

2424 # .format(new_coltags, previously_seen, all_hdr_tags)) 

2425 if any(new_coltags): 

2426 ( 

2427 col, 

2428 col0_followed_by_nonempty, 

2429 col0_hdrspan, 

2430 ) = add_new_hdrspan( 

2431 col, 

2432 hdrspans, 

2433 store_new_hdrspan, 

2434 col0_followed_by_nonempty, 

2435 col0_hdrspan, 

2436 ) 

2437 

2438 continue 

2439 

2440 # These values are ignored, at least for now 

2441 if re.match(r"^(# |\(see )", col): 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true

2442 continue 

2443 

2444 if any("dummy-skip-this" in ts for ts in rowtags): 

2445 continue # Skip this cell 

2446 

2447 # If the word has no rowtags and is a multi-row cell, then 

2448 # ignore this. This happens with empty separator rows 

2449 # within a rowspan>1 cell. cf. wander/English/Conjugation. 

2450 if rowtags == [()] and rowspan > 1: 

2451 continue 

2452 

2453 # Minor cleanup. See e.g. είμαι/Greek/Verb present participle. 

2454 if cleanup_rules: 

2455 for regx, substitution in cleanup_rules.items(): 

2456 col = re.sub(regx, substitution, col) 

2457 

2458 if ( 2458 ↛ 2463line 2458 didn't jump to line 2463 because the condition on line 2458 was never true

2459 col_idx == 0 

2460 and not first_col_has_text 

2461 and get_lang_conf(lang, "ignore_top_left_text_cell") is True 

2462 ): 

2463 continue # Skip text at top left, as in Icelandic, Faroese 

2464 

2465 # if col0_hdrspan is not None: 

2466 # print("COL0 FOLLOWED NONHDR: {!r} by {!r}" 

2467 # .format(col0_hdrspan.text, col)) 

2468 col0_followed_by_nonempty = True 

2469 have_text = True 

2470 

2471 # Determine column tags for the multi-column cell 

2472 combined_coltags = compute_coltags( 

2473 lang, pos, hdrspans, col_idx, colspan, col 

2474 ) 

2475 if any("dummy-ignored-text-cell" in ts for ts in combined_coltags): 2475 ↛ 2476line 2475 didn't jump to line 2476 because the condition on line 2475 was never true

2476 continue 

2477 

2478 # print("HAVE_TEXT:", repr(col)) 

2479 # Split the text into separate forms. First simplify spaces except 

2480 # newline. 

2481 col = re.sub(r"[ \t\r]+", " ", col) 

2482 # Split the cell text into alternatives 

2483 

2484 col, alts, split_extra_tags = split_text_into_alts(col) 

2485 

2486 # Some cells have mixed form content, like text and romanization, 

2487 # or text and IPA. Handle these. 

2488 alts = handle_mixed_lines(alts) 

2489 

2490 alts = list((x, combined_coltags) for x in alts) 

2491 

2492 # Generate forms from the alternatives 

2493 # alts is a list of (tuple of forms, tuple of tags) 

2494 for (form, base_roman, ipa), coltags in alts: 

2495 form = form.strip() 

2496 extra_tags = [] 

2497 extra_tags.extend(split_extra_tags) 

2498 # Handle special splits again here, so that we can have custom 

2499 # mappings from form to form and tags. 

2500 if form in form_replacements: 

2501 replacement, tags = form_replacements[form] 

2502 for x in tags.split(): 

2503 assert x in valid_tags 

2504 assert isinstance(replacement, str) 

2505 assert isinstance(tags, str) 

2506 form = replacement 

2507 extra_tags.extend(tags.split()) 

2508 # Clean the value, extracting reference symbols 

2509 form, refs, defs, hdr_tags = extract_cell_content( 

2510 lang, word, form 

2511 ) 

2512 # if refs: 

2513 # print("REFS:", refs) 

2514 extra_tags.extend(hdr_tags) 

2515 # Extract tags from referenced footnotes 

2516 # Extract tags from referenced footnotes 

2517 refs_tags = set() 

2518 for ref in refs: 

2519 if ref in def_ht: 

2520 refs_tags.update(def_ht[ref]) 

2521 

2522 if base_roman: 

2523 base_roman, _, _, hdr_tags = extract_cell_content( 

2524 lang, word, base_roman 

2525 ) 

2526 extra_tags.extend(hdr_tags) 

2527 

2528 # Do some additional cleanup on the cell. 

2529 form = re.sub(r"^\s*,\s*", "", form) 

2530 form = re.sub(r"\s*,\s*$", "", form) 

2531 form = re.sub(r"\s*(,\s*)+", ", ", form) 

2532 form = re.sub(r"(?i)^Main:", "", form) 

2533 form = re.sub(r"\s+", " ", form) 

2534 form = form.strip() 

2535 

2536 # Look for parentheses that have semantic meaning 

2537 form, et = find_semantic_parens(form) 

2538 extra_tags.extend(et) 

2539 

2540 # Handle parentheses in the table element. We parse 

2541 # tags anywhere and romanizations anywhere but beginning. 

2542 roman = base_roman 

2543 paren = None 

2544 clitic = None 

2545 m = re.search(r"(\s+|^)\(([^)]*)\)", form) 

2546 # start|spaces + (anything) 

2547 if m is not None: 

2548 subst = m.group(1) 

2549 paren = m.group(2) 

2550 else: 

2551 m = re.search(r"\(([^)]*)\)(\s+|$)", form) 

2552 # (anything) + spaces|end 

2553 if m is not None: 2553 ↛ 2554line 2553 didn't jump to line 2554 because the condition on line 2553 was never true

2554 paren = m.group(1) 

2555 subst = m.group(2) 

2556 if paren is not None: 

2557 form, roman, clitic = handle_parens( 

2558 form, roman, clitic, extra_tags 

2559 ) 

2560 

2561 # Ignore certain forms that are not really forms, 

2562 # unless they're really, really close to the article title 

2563 if form in ( 2563 ↛ 2568line 2563 didn't jump to line 2568 because the condition on line 2563 was never true

2564 "", 

2565 "unchanged", 

2566 "after an", # in sona/Irish/Adj/Mutation 

2567 ): 

2568 Lev = distw([form], word) 

2569 if form and Lev < 0.1: 

2570 wxr.wtp.debug( 

2571 "accepted possible false positive '{}' with" 

2572 "> 0.1 Levenshtein distance in {}/{}".format( 

2573 form, word, lang 

2574 ), 

2575 sortid="inflection/2213", 

2576 ) 

2577 elif form and Lev < 0.3: 

2578 wxr.wtp.debug( 

2579 "skipped possible match '{}' with > 0.3" 

2580 "Levenshtein distance in {}/{}".format( 

2581 form, word, lang 

2582 ), 

2583 sortid="inflection/2218", 

2584 ) 

2585 continue 

2586 else: 

2587 continue 

2588 # print("ROWTAGS={} COLTAGS={} REFS_TAGS={} " 

2589 # "FORM={!r} ROMAN={!r}" 

2590 # .format(rowtags, coltags, refs_tags, 

2591 # form, roman)) 

2592 

2593 # Merge tags from row and column and do miscellaneous 

2594 # tag-related handling. 

2595 ( 

2596 merge_ret, 

2597 form, 

2598 some_has_covered_text, 

2599 ) = merge_row_and_column_tags(form, some_has_covered_text) 

2600 ret.extend(merge_ret) 

2601 

2602 # End of row. 

2603 rownum += 1 

2604 # For certain languages, if the row was empty, reset 

2605 # hdrspans (saprast/Latvian/Verb, but not aussteigen/German/Verb). 

2606 if row_empty and get_lang_conf(lang, "empty_row_resets"): 

2607 hdrspans = [] 

2608 # Check if we should expand col0_hdrspan. 

2609 if col0_hdrspan is not None: 

2610 col0_allowed = get_lang_conf(lang, "hdr_expand_first") 

2611 col0_cats = tagset_cats(col0_hdrspan.tagsets) 

2612 # Only expand if col0_cats and later_cats are allowed 

2613 # and don't overlap and col0 has tags, and there have 

2614 # been no disallowed cells in between. 

2615 if ( 

2616 not col0_followed_by_nonempty 

2617 and not (col0_cats - col0_allowed) 

2618 and 

2619 # len(col0_cats) == 1 and 

2620 col_idx > col0_hdrspan.start + col0_hdrspan.colspan 

2621 ): 

2622 # If an earlier header is only followed by headers that yield 

2623 # no tags, expand it to entire row 

2624 # print("EXPANDING COL0: {} from {} to {} cols {}" 

2625 # .format(col0_hdrspan.text, col0_hdrspan.colspan, 

2626 # len(row) - col0_hdrspan.start, 

2627 # col0_hdrspan.tagsets)) 

2628 col0_hdrspan.colspan = len(row) - col0_hdrspan.start 

2629 col0_hdrspan.expanded = True 

2630 # XXX handle refs and defs 

2631 # for x in hdrspans: 

2632 # print(" HDRSPAN {} {} {} {!r}" 

2633 # .format(x.start, x.colspan, x.tagsets, x.text)) 

2634 

2635 # Post-process German nouns with articles in separate columns. We move the 

2636 # definite/indefinite/usually-without-article markers into the noun and 

2637 # remove the article entries. 

2638 if get_lang_conf(lang, "articles_in_separate_columns") and any( 

2639 "noun" in x["tags"] for x in ret 

2640 ): 

2641 new_ret = [] 

2642 saved_tags = set() 

2643 had_noun = False 

2644 for dt in ret: 

2645 tags = dt["tags"] 

2646 # print(tags) 

2647 if "noun" in tags: 

2648 tags = list( 

2649 sorted(set(t for t in tags if t != "noun") | saved_tags) 

2650 ) 

2651 had_noun = True 

2652 elif ( 2652 ↛ 2679line 2652 didn't jump to line 2679 because the condition on line 2652 was always true

2653 "indefinite" in tags 

2654 or "definite" in tags 

2655 or "usually-without-article" in tags 

2656 or "without-article" in tags 

2657 ): 

2658 if had_noun: 

2659 saved_tags = set(tags) 

2660 else: 

2661 saved_tags = saved_tags | set(tags) # E.g. Haus/German 

2662 remove_useless_tags(lang, pos, saved_tags) 

2663 saved_tags = saved_tags & set( 

2664 [ 

2665 "masculine", 

2666 "feminine", 

2667 "neuter", 

2668 "singular", 

2669 "plural", 

2670 "indefinite", 

2671 "definite", 

2672 "usually-without-article", 

2673 "without-article", 

2674 ] 

2675 ) 

2676 had_noun = False 

2677 continue # Skip the articles 

2678 

2679 dt = dt.copy() 

2680 dt["tags"] = tags 

2681 new_ret.append(dt) 

2682 ret = new_ret 

2683 

2684 elif possibly_ignored_forms: 

2685 # Some languages have tables with cells that are kind of separated 

2686 # and difficult to handle, like eulersche Formel/German where 

2687 # the definite and indefinite articles are just floating. 

2688 # If a language has a dict of conditionally_ignored_cells, 

2689 # and if the contents of a cell is found in one of the rules 

2690 # there, ignore that cell if it 

2691 # 1. Does not have the appropriate tag (like "definite" for "die") 

2692 # and 

2693 # 2. The title of the article is not one of the other co-words 

2694 # (ie. it's an article for the definite articles in german etc.) 

2695 # pass 

2696 new_ret = [] 

2697 for cell_data in ret: 

2698 tags = cell_data["tags"] 

2699 text = cell_data["form"] 

2700 skip_this = False 

2701 for key_tag, ignored_forms in possibly_ignored_forms.items(): 

2702 if text not in ignored_forms: 2702 ↛ 2704line 2702 didn't jump to line 2704 because the condition on line 2702 was always true

2703 continue 

2704 if word in ignored_forms: 

2705 continue 

2706 if key_tag not in tags: 

2707 skip_this = True 

2708 

2709 if skip_this: 2709 ↛ 2710line 2709 didn't jump to line 2710 because the condition on line 2709 was never true

2710 continue 

2711 new_ret.append(cell_data) 

2712 

2713 ret = new_ret 

2714 

2715 # Post-process English inflection tables, addding "multiword-construction" 

2716 # when the number of words has increased. 

2717 if lang == "English" and pos == "verb": 

2718 word_words = len(word.split()) 

2719 new_ret = [] 

2720 for dt in ret: 

2721 form = dt.get("form", "") 

2722 if len(form.split()) > word_words: 

2723 dt = dt.copy() 

2724 dt["tags"] = list(dt.get("tags", [])) 

2725 # This strange copy-assigning shuffle is preventative black 

2726 # magic; do not touch lest you invoke deep bugs. 

2727 data_append(dt, "tags", "multiword-construction") 

2728 new_ret.append(dt) 

2729 ret = new_ret 

2730 

2731 # Always insert "table-tags" detail as the first entry in any inflection 

2732 # table. This way we can reliably detect where a new table starts. 

2733 # Table-tags applies until the next table-tags entry. 

2734 if ret or table_tags: 

2735 table_tags = list(sorted(set(table_tags))) 

2736 dt = { 

2737 "form": " ".join(table_tags), 

2738 "source": source, 

2739 "tags": ["table-tags"], 

2740 } 

2741 if dt["form"] == "": 

2742 dt["form"] = "no-table-tags" 

2743 if tablecontext.template_name: 

2744 tn = { 

2745 "form": tablecontext.template_name, 

2746 "source": source, 

2747 "tags": ["inflection-template"], 

2748 } 

2749 ret = [dt] + [tn] + ret 

2750 else: 

2751 ret = [dt] + ret 

2752 

2753 return ret 

2754 

2755 

def handle_generic_table(
    wxr, tablecontext, data, word, lang, pos, rows, titles, source, after, depth
):
    """Parse an inflection table, already split into ``rows`` of InflCell
    objects, into word forms and append them to ``data["forms"]``.

    The actual interpretation of the table is delegated to
    parse_simple_table(); this function validates its arguments, logs
    tables that could not be parsed, and de-duplicates the returned form
    entries before appending them to ``data``.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(rows, list)
    assert isinstance(source, str)
    assert isinstance(after, str)
    assert isinstance(depth, int)
    for row in rows:
        assert isinstance(row, list)
        for x in row:
            assert isinstance(x, InflCell)
    assert isinstance(titles, list)
    for x in titles:
        assert isinstance(x, str)

    # Try to parse the table as a simple table
    ret = parse_simple_table(
        wxr, tablecontext, word, lang, pos, rows, titles, source, after, depth
    )
    if ret is None:
        # XXX handle other table formats
        # We were not able to handle the table
        wxr.wtp.debug(
            "unhandled inflection table format, {}/{}".format(word, lang),
            sortid="inflection/2370",
        )
        return

    # Add the returned forms but eliminate duplicates.
    have_forms = set()
    for dt in ret:
        fdt = freeze(dt)
        if fdt in have_forms:
            continue  # Don't add duplicates
        # Some Russian words have Declension and Pre-reform declension partially
        # duplicating same data. Don't add "dated" tags variant if already have
        # the same without "dated" from the modern declension table

        tags = dt.get("tags", [])
        for dated_tag in ("dated",):
            if dated_tag in tags:
                dt2 = dt.copy()
                tags2 = list(x for x in tags if x != dated_tag)
                dt2["tags"] = tags2
                if tags2 and freeze(dt2) in have_forms:
                    break  # Already have without archaic
        else:
            # for-else: this branch runs only when the loop above did NOT
            # break, i.e. no equivalent non-"dated" entry was already seen.
            if "table-tags" not in tags:
                have_forms.add(fdt)
            data_append(data, "forms", dt)

2812 

def determine_header(
    wxr,
    tablecontext,
    lang,
    word,
    pos,
    table_kind,
    kind,
    style,
    row,
    col,
    celltext,
    titletext,
    cols_headered,
    target,
    cellstyle,
):
    """Decide whether a table cell should be treated as a header.

    Applies a chain of heuristics (explicit header markup, expansion of
    the cell text through infl_map, per-language whitelists in
    LANGUAGES_WITH_CELLS_AS_HEADERS, and style comparison against the
    first cell of the row) and returns a 4-tuple
    ``(is_title, hdr_expansion, target, celltext)``.  ``celltext`` may be
    rewritten (e.g. truncated at a ": " separator, or emptied for cells
    whose expansion contains a "*" meta-tag column marker handled by the
    caller).
    """
    assert isinstance(table_kind, NodeKind)
    assert isinstance(kind, (NodeKind, str))
    assert style is None or isinstance(style, str)
    assert cellstyle is None or isinstance(cellstyle, str)

    if table_kind == NodeKind.TABLE:
        header_kind = NodeKind.TABLE_HEADER_CELL
    elif table_kind == NodeKind.HTML:
        header_kind = "th"
    # NOTE(review): if table_kind is a NodeKind other than TABLE or HTML,
    # header_kind is left unbound and the comparisons below would raise
    # NameError — presumably callers only pass TABLE/HTML; confirm.
    idx = celltext.find(": ")
    is_title = False
    # remove anything in parentheses, compress whitespace, .strip()
    cleaned_titletext = re.sub(
        r"\s+", " ", re.sub(r"\s*\([^)]*\)", "", titletext)
    ).strip()
    cleaned, _, _, _ = extract_cell_content(lang, word, celltext)
    cleaned = re.sub(r"\s+", " ", cleaned)
    hdr_expansion = expand_header(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        cleaned,
        [],
        silent=True,
        ignore_tags=True,
    )
    candidate_hdr = not any(
        any(t.startswith("error-") for t in ts) for ts in hdr_expansion
    )
    # KJ candidate_hdr says that a specific cell is a candidate
    # for being a header because it passed through expand_header
    # without getting any "error-" tags; that is, the contents
    # is "valid" for being a header; these are the false positives
    # we want to catch
    ignored_cell = any(
        any(t.startswith("dummy-") for t in ts) for ts in hdr_expansion
    )
    # ignored_cell should NOT be used to filter for headers, like
    # candidate_hdr is used, but only to filter for related *debug
    # messages*: some dummy-tags are actually half-way to headers,
    # like ones with "Notes", so they MUST be headers, but later
    # on they're ignored *as* headers so they don't need to print
    # out any cells-as-headers debug messages.
    if (
        candidate_hdr
        and kind != header_kind
        and cleaned != ""
        and cleaned != "dummy-ignored-text-cell"
        and cleaned not in IGNORED_COLVALUES
    ):
        # Heuristic header: the cell is not marked up as a header but its
        # text expands cleanly; accept it only for whitelisted languages
        # and whitelisted cell texts.
        # print("col: {}".format(col))
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2447",
            )
            candidate_hdr = False
        elif cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, ""):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header and given "
                "candidate status, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2457",
            )
            candidate_hdr = False
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header and given "
                "candidate status, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}".format(lang, cleaned),
                sortid="inflection/2466",
            )

    # If the cell starts with something that could start a
    # definition (typically a reference symbol), make it a candidate
    # regardless of whether the language is listed.
    if re.match(def_re, cleaned) and not re.match(nondef_re, cleaned):
        candidate_hdr = True

    # print("titletext={!r} hdr_expansion={!r} candidate_hdr={!r} "
    #       "lang={} pos={}"
    #       .format(titletext, hdr_expansion, candidate_hdr,
    #               lang, pos))
    if idx >= 0 and titletext[:idx] in infl_map:
        # "<header>: <target>" cells: split off the target word(s) and
        # keep only the header part as the cell text.
        target = titletext[idx + 2 :].strip()
        celltext = celltext[:idx]
        is_title = True
    elif (
        kind == header_kind
        and " + " not in titletext  # For "avoir + blah blah"?
        and not any(
            isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "span"
            and x.attrs.get("lang") in ("az",)
            for x in col.children
        )
    ):
        is_title = True
    elif (
        candidate_hdr
        and cleaned_titletext not in IGNORED_COLVALUES
        and distw([cleaned_titletext], word) > 0.3
        and cleaned_titletext not in ("I", "es")
    ):
        is_title = True
    # if first column or same style as first column
    elif (
        style == cellstyle
        and
        # and title is not identical to word name
        titletext != word
        and cleaned not in IGNORED_COLVALUES
        and cleaned != "dummy-ignored-text-cell"
        and
        # the style composite string is not broken
        not style.startswith("////")
        and " + " not in titletext
    ):
        if not ignored_cell and lang not in LANGUAGES_WITH_CELLS_AS_HEADERS:
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT {} is not in "
                "LANGUAGES_WITH_CELLS_AS_HEADERS; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2512",
            )
        elif (
            not ignored_cell
            and cleaned not in LANGUAGES_WITH_CELLS_AS_HEADERS.get(lang, "")
        ):
            wxr.wtp.debug(
                "rejected heuristic header: "
                "table cell identified as header based "
                "on style, BUT the cleaned text is "
                "not in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2522",
            )
        else:
            wxr.wtp.debug(
                "accepted heuristic header: "
                "table cell identified as header based "
                "on style, AND the cleaned text is "
                "in LANGUAGES_WITH_CELLS_AS_HEADERS[{}]; "
                "cleaned text: {}, style: {}".format(lang, cleaned, style),
                sortid="inflection/2530",
            )
            is_title = True
    if (
        not is_title
        and len(row) < len(cols_headered)
        and cols_headered[len(row)]
    ):
        # Whole column has title suggesting they are headers
        # (e.g. "Case")
        is_title = True
    if re.match(
        r"Conjugation of |Declension of |Inflection of |"
        r"Mutation of |Notes\b",  # \b is word-boundary
        titletext,
    ):
        is_title = True
    return is_title, hdr_expansion, target, celltext

3005 

3006 

class TableContext:
    """Saved context used when parsing a table and its subtables.

    Attributes:
        stored_hdrspans: header spans carried over between subtables.
        section_header: header data for the enclosing section.
        template_name: name of the inflection template that produced the
            table, or "" when unknown.
    """

    # Bug fix: the original declared ``__slot__`` (singular), which is
    # just an ordinary, ignored class attribute; ``__slots__`` is the
    # special name that actually suppresses the per-instance __dict__.
    __slots__ = (
        "stored_hdrspans",
        "section_header",
        "template_name",
    )

    def __init__(self, template_name=None):
        self.stored_hdrspans = []
        self.section_header = []
        if not template_name:
            # Normalize falsy values (None, "") to the empty string so
            # template_name is always a str.
            self.template_name = ""
        else:
            self.template_name = template_name

3023 

3024 

def handle_wikitext_or_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Parses a table from parsed Wikitext format into rows and columns of
    InflCell objects and then calls handle_generic_table() to parse it into
    forms.  This adds the forms into ``data``.

    ``tree`` must be either a wikitext TABLE node or an HTML <table>
    node; nested tables are flattened (not recursed into in place) via
    the inner helper handle_table1().
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(data, dict)
    assert isinstance(tree, WikiNode)
    assert tree.kind == NodeKind.TABLE or (
        tree.kind == NodeKind.HTML and tree.sarg == "table"
    )
    assert isinstance(titles, list)
    assert isinstance(source, str)
    for x in titles:
        assert isinstance(x, str)
    assert isinstance(after, str)
    assert tablecontext is None or isinstance(tablecontext, TableContext)
    # Imported here to avoid a circular import
    from wiktextract.page import clean_node, recursively_extract

    if not tablecontext:
        tablecontext = TableContext()

    def handle_table1(
        wxr,
        tablecontext,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        depth,
    ):
        """Helper function allowing the 'flattening' out of the table
        recursion: instead of handling the tables in the wrong order
        (recursively), this function adds to new_row that is then
        iterated through in the main function at the end, creating
        a longer table (still in pieces) in the correct order."""

        assert isinstance(data, dict)
        assert isinstance(titles, list)
        assert isinstance(source, str)
        for x in titles:
            assert isinstance(x, str)
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)

        col_gap_data = []  # Filling for columns with rowspan > 1
        # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
        # the column; vertical_still_left contains int
        cols_headered = []  # [F, T, F, F...]
        # True when the whole column contains headers, even
        # when the cell is not considered a header; triggered
        # by the "*" inflmap meta-tag.
        rows = []

        sub_ret = []

        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
            if node.kind == NodeKind.HTML:
                kind = node.sarg
            else:
                kind = node.kind

            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
                pass
            elif kind in (NodeKind.TABLE_ROW, "tr"):
                if "vsShow" in node.attrs.get("class", "").split():
                    # vsShow rows are those that are intially shown in tables
                    # that have more data.  The hidden data duplicates these
                    # rows, so we skip it and just process the hidden data.
                    continue

                # Parse a table row.
                row = []
                style = None
                row_has_nonempty_cells = False
                # Have nonempty cell not from rowspan
                for col in node.children:
                    # loop through each cell in the ROW
                    if not isinstance(col, WikiNode):
                        # This skip is not used for counting,
                        # "None" is not used in
                        # indexing or counting or looping.
                        continue
                    if col.kind == NodeKind.HTML:
                        kind = col.sarg
                    else:
                        kind = col.kind
                    if kind not in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CELL,
                        "th",
                        "td",
                    ):
                        print("    UNEXPECTED ROW CONTENT: {}".format(col))
                        continue

                    while (
                        len(row) < len(vertical_still_left)
                        and vertical_still_left[len(row)] > 0
                    ):
                        # vertical_still_left is [...0, 0, 2...] for each
                        # column. It is populated at the end of the loop, at the
                        # same time as col_gap_data. This needs to be looped and
                        # filled this way because each `for col`-looping jumps
                        # straight to the next meaningful cell; there is no
                        # "None" cells, only emptiness between, and rowspan and
                        # colspan are just to generate the "fill-
                        vertical_still_left[len(row)] -= 1
                        row.append(col_gap_data[len(row)])

                        # appending row is how "indexing" is
                        # done here; something is appended,
                        # like a filler-cell here or a "start"
                        # cell at the end of the row-loop,
                        # which increased len(row) which is
                        # then used as the target-index to check
                        # for gaps. vertical_still_left is
                        # the countdown to when to stop
                        # filling in gaps, and goes down to 0,
                        # and col_gap_data is not touched
                        # except when a new rowspan is needed,
                        # at the same time that
                        # vertical_still_left gets reassigned.

                    try:
                        rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                        colspan = int(col.attrs.get("colspan", "1"))  # 🡘
                    except ValueError:
                        # Malformed rowspan/colspan attribute: fall back
                        # to an ordinary 1x1 cell.
                        rowspan = 1
                        colspan = 1
                    # print("COL:", col)

                    # Process any nested tables recursively.
                    tables, rest = recursively_extract(
                        col,
                        lambda x: isinstance(x, WikiNode)
                        and (x.kind == NodeKind.TABLE or x.sarg == "table"),
                    )

                    # Clean the rest of the cell.
                    celltext = clean_node(wxr, None, rest)
                    # print("CLEANED:", celltext)

                    # Handle nested tables.
                    for tbl in tables:
                        # Some nested tables (e.g., croí/Irish) have subtitles
                        # as normal paragraphs in the same cell under a descrip-
                        # tive text that should be treated as a title (e.g.,
                        # "Forms with the definite article", with "definite" not
                        # mentioned elsewhere).
                        new_titles = list(titles)
                        if celltext:
                            new_titles.append(celltext)
                        subtbl = handle_table1(
                            wxr,
                            tablecontext,
                            word,
                            lang,
                            pos,
                            data,
                            tbl,
                            new_titles,
                            source,
                            "",
                            depth + 1,
                        )
                        if subtbl:
                            # Flush what we have collected so far and then
                            # append the subtable's pieces, preserving order.
                            sub_ret.append((rows, titles, after, depth))
                            rows = []
                            titles = []
                            after = ""
                            sub_ret.extend(subtbl)

                    # This magic value is used as part of header detection
                    cellstyle = (
                        col.attrs.get("style", "")
                        + "//"
                        + col.attrs.get("class", "")
                        + "//"
                        + str(kind)
                    )

                    if not row:  # if first column in row
                        style = cellstyle
                    target = None
                    titletext = celltext.strip()
                    while titletext and is_superscript(titletext[-1]):
                        titletext = titletext[:-1]

                    (
                        is_title,
                        hdr_expansion,
                        target,
                        celltext,
                    ) = determine_header(
                        wxr,
                        tablecontext,
                        lang,
                        word,
                        pos,
                        tree.kind,
                        kind,
                        style,
                        row,
                        col,
                        celltext,
                        titletext,
                        cols_headered,
                        None,
                        cellstyle,
                    )

                    if is_title:
                        # If this cell gets a "*" tag, make the whole column
                        # below it (toggling it in cols_headered = [F, F, T...])
                        # into headers.
                        while len(cols_headered) <= len(row):
                            cols_headered.append(False)
                        if any("*" in tt for tt in hdr_expansion):
                            cols_headered[len(row)] = True
                            celltext = ""
                    # if row_has_nonempty_cells has been True at some point, it
                    # keeps on being True.
                    # if row_has_nonempty_cells or is_title or celltext != "":
                    #   row_has_nonempty_cells = True
                    # ⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓⇓
                    row_has_nonempty_cells |= is_title or celltext != ""
                    cell = InflCell(
                        celltext, is_title, colspan, rowspan, target
                    )
                    for _ in range(0, colspan):
                        # colspan🡘 current loop (col) or 1
                        # All the data-filling for colspan
                        # is done simply in this loop,
                        # while rowspan needs to use
                        # vertical_still_left to count gaps
                        # and col_gap_data to fill in
                        # those gaps with InflCell data.
                        if rowspan > 1:  # rowspan🡙 current loop (col) or 1
                            while len(col_gap_data) <= len(row):
                                # Initialize col_gap_data/ed if
                                # it is lacking slots
                                # for each column; col_gap_data and
                                # vertical_still_left are never
                                # reset to [], during
                                # the whole table function.
                                col_gap_data.append(None)
                                vertical_still_left.append(0)
                            # Below is where the "rectangle" block of rowspan
                            # and colspan is filled for the future.
                            col_gap_data[len(row)] = cell
                            # col_gap_data contains cells that
                            # will be used in the
                            # future, or None
                            vertical_still_left[len(row)] = rowspan - 1
                            # A counter for how many gaps🡙 are still left to be
                            # filled (row.append or
                            # row[col_gap_data[len(row)] =>
                            # rows), it is not reset to [], but decremented to 0
                            # each time a row gets something from col_gap_data.
                        # Append this cell 1+ times for colspan🡘
                        row.append(cell)
                if not row:
                    continue
                # After looping the original row-nodes above, fill
                # in the rest of the row if the final cell has colspan
                # (inherited from above, so a cell with rowspan and colspan)
                for i in range(len(row), len(vertical_still_left)):
                    if vertical_still_left[i] <= 0:
                        continue
                    vertical_still_left[i] -= 1
                    while len(row) < i:
                        row.append(InflCell("", False, 1, 1, None))
                    row.append(col_gap_data[i])
                # print("  ROW {!r}".format(row))
                if row_has_nonempty_cells:
                    rows.append(row)
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CELL,
                "th",
                "td",
                "span",
            ):
                # print("  TOP-LEVEL CELL", node)
                pass

        if sub_ret:
            main_ret = sub_ret
            main_ret.append((rows, titles, after, depth))
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret

    new_rows = handle_table1(
        wxr, tablecontext, word, lang, pos, data, tree, titles, source, after, 0
    )

    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects.  Parse the inflection table from that format.
    if new_rows:
        for rows, titles, after, depth in new_rows:
            handle_generic_table(
                wxr,
                tablecontext,
                data,
                word,
                lang,
                pos,
                rows,
                titles,
                source,
                after,
                depth,
            )

3355 

3356 

def handle_html_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Forward an HTML <table> node to the shared wikitext/HTML table
    handler.  XXX thin pass-through wrapper; candidate for removal."""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext=tablecontext,
    )

3364 

3365 

def handle_wikitext_table(
    wxr, word, lang, pos, data, tree, titles, source, after, tablecontext=None
):
    """Forward a wikitext TABLE node to the shared wikitext/HTML table
    handler.  XXX thin pass-through wrapper; candidate for removal."""
    handle_wikitext_or_html_table(
        wxr,
        word,
        lang,
        pos,
        data,
        tree,
        titles,
        source,
        after,
        tablecontext=tablecontext,
    )

3373 

3374 

3375def parse_inflection_section( 

3376 wxr, data, word, lang, pos, section, tree, tablecontext=None 

3377): 

3378 """Parses an inflection section on a page. ``data`` should be the 

3379 data for a part-of-speech, and inflections will be added to it.""" 

3380 

3381 # print("PARSE_INFLECTION_SECTION {}/{}/{}/{}" 

3382 # .format(word, lang, pos, section)) 

3383 assert isinstance(wxr, WiktextractContext) 

3384 assert isinstance(data, dict) 

3385 assert isinstance(word, str) 

3386 assert isinstance(lang, str) 

3387 assert isinstance(section, str) 

3388 assert isinstance(tree, WikiNode) 

3389 assert tablecontext is None or isinstance(tablecontext, TableContext) 

3390 source = section 

3391 tables = [] 

3392 titleparts = [] 

3393 preceding_bolded_title = "" 

3394 

3395 def process_tables(): 

3396 for kind, node, titles, after in tables: 

3397 after = "".join(after).strip() 

3398 after = clean_value(wxr, after) 

3399 if kind == "wikitext": 

3400 handle_wikitext_table( 

3401 wxr, 

3402 word, 

3403 lang, 

3404 pos, 

3405 data, 

3406 node, 

3407 titles, 

3408 source, 

3409 after, 

3410 tablecontext=tablecontext, 

3411 ) 

3412 elif kind == "html": 3412 ↛ 3426line 3412 didn't jump to line 3426 because the condition on line 3412 was always true

3413 handle_html_table( 

3414 wxr, 

3415 word, 

3416 lang, 

3417 pos, 

3418 data, 

3419 node, 

3420 titles, 

3421 source, 

3422 after, 

3423 tablecontext=tablecontext, 

3424 ) 

3425 else: 

3426 raise RuntimeError( 

3427 "{}: unimplemented table kind {}".format(word, kind) 

3428 ) 

3429 

3430 def recurse_navframe(node, titles): 

3431 nonlocal tables 

3432 nonlocal titleparts 

3433 titleparts = [] 

3434 old_tables = tables 

3435 tables = [] 

3436 

3437 recurse(node, [], navframe=True) 

3438 

3439 process_tables() 

3440 tables = old_tables 

3441 

    def recurse(node, titles, navframe=False):
        # Recursively walk the parse tree, appending any table found (with the
        # titles collected so far) onto ``tables``.  ``navframe`` is True while
        # walking a NavFrame header, where bare strings accumulate into the
        # frame's title rather than a table's trailing text.
        nonlocal tables
        if isinstance(node, (list, tuple)):
            for x in node:
                recurse(x, titles, navframe)
            return
        if isinstance(node, str):
            # Free text after a table becomes that table's "after" text;
            # inside a NavFrame header it contributes to the title instead.
            if tables:
                tables[-1][-1].append(node)
            elif navframe:
                titleparts.append(node)
            return
        if not isinstance(node, WikiNode):
            if navframe:
                wxr.wtp.debug(
                    "inflection table: unhandled in NavFrame: {}".format(node),
                    sortid="inflection/2907",
                )
            return
        kind = node.kind
        if navframe:
            if kind == NodeKind.HTML:
                classes = node.attrs.get("class", "").split()
                if "NavToggle" in classes:
                    # Show/hide button — no content of interest.
                    return
                if "NavHead" in classes:
                    # print("NAVHEAD:", node)
                    recurse(node.children, titles, navframe)
                    return
                if "NavContent" in classes:
                    # print("NAVCONTENT:", node)
                    # The text accumulated from NavHead becomes the title for
                    # tables inside this NavContent (unless it is just a note).
                    title = "".join(titleparts).strip()
                    title = html.unescape(title)
                    title = title.strip()
                    new_titles = list(titles)
                    if not re.match(r"(Note:|Notes:)", title):
                        new_titles.append(title)
                    recurse(node, new_titles, navframe=False)
                    return
        else:
            if kind == NodeKind.TABLE:
                tables.append(["wikitext", node, titles, []])
                return
            elif kind == NodeKind.HTML and node.sarg == "table":
                classes = node.attrs.get("class", ())
                if "audiotable" in classes:
                    # Pronunciation tables are not inflection tables.
                    return
                tables.append(["html", node, titles, []])
                return
            elif kind in (
                NodeKind.LEVEL2,
                NodeKind.LEVEL3,
                NodeKind.LEVEL4,
                NodeKind.LEVEL5,
                NodeKind.LEVEL6,
            ):
                return  # Skip subsections
        if (
            kind == NodeKind.HTML
            and node.sarg == "div"
            and "NavFrame" in node.attrs.get("class", "").split()
        ):
            recurse_navframe(node, titles)
            return
        if kind == NodeKind.LINK:
            # Descend into the link's display text when present, otherwise
            # into the link target.
            if len(node.largs) > 1:
                recurse(node.largs[1:], titles, navframe)
            else:
                recurse(node.largs[0], titles, navframe)
            return
        if kind == NodeKind.LIST and node.sarg == ";":
            # A ";" definition-list line acts as a bolded title for the
            # content that follows it.
            nonlocal preceding_bolded_title
            # Local import avoids a circular import at module load time —
            # presumably; confirm against wiktextract.page.
            from wiktextract.page import clean_node
            preceding_bolded_title = clean_node(wxr, None, node).strip("; ")
        for x in node.children:
            recurse(x, titles, navframe)

3518 

    # Walk the whole section tree, queuing any tables found outside NavFrames.
    assert tree.kind == NodeKind.ROOT
    for x in tree.children:
        # A preceding ";" list line (if one was captured) supplies the
        # initial title for subsequent tables.
        if preceding_bolded_title != "":
            recurse(x, [preceding_bolded_title])
        else:
            recurse(x, [])

    # Process the tables we found
    process_tables()

3528 

3529 # XXX this code is used for extracting tables for inflection tests 

3530 if wxr.config.expand_tables: 3530 ↛ 3531line 3530 didn't jump to line 3531 because the condition on line 3530 was never true

3531 if section != "Mutation": 

3532 with open(wxr.config.expand_tables, "w") as f: 

3533 f.write(word + "\n") 

3534 f.write(lang + "\n") 

3535 f.write(pos + "\n") 

3536 f.write(section + "\n") 

3537 text = wxr.wtp.node_to_wikitext(tree) 

3538 f.write(text + "\n")