Coverage for src/wiktextract/extractor/el/table.py: 67%

239 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2from typing import TypeAlias 

3from unicodedata import name as unicode_name 

4 

5from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

6 

7from wiktextract.clean import clean_value 

8from wiktextract.extractor.el.tags import base_tag_map 

9from wiktextract.wxr_context import WiktextractContext 

10from wiktextract.wxr_logging import logger 

11 

12from .models import Form, WordEntry 

13from .parse_utils import GREEK_LANGCODES, remove_duplicate_forms 

14 

15# from .simple_tags import simple_tag_map 

16# from .tags_utils import convert_tags 

17 

# Shorthand for this file: an item of a node's child list, either plain
# text or a parsed node. Could be an import, but it's so simple...
Node: TypeAlias = str | WikiNode

20 

21 

22# GREEK TABLE HEURISTICS: 

23# If it's a table for a Greek language entry, if it's in a header or is in 

24# italics, it's a header. 

25# If it's NOT a Greek entry and has Greek text, it's a header. 

26 

27 

28# node_fns are different from template_fns. template_fns are functions that 

29# are used to handle how to expand (and otherwise process) templates, while 

30# node functions are used when turning any parsed "abstract" nodes into strings. 

def cell_node_fn(
    node: WikiNode,
) -> list[Node] | None:
    """Handle nodes in the parse tree specially.

    Node handler passed to ``node_to_text`` when a table cell is flattened
    into a string (see ``process_cell_text``).

    Returns:
        * for an italic node, its children wrapped in ``__I__`` / ``__/I__``;
        * for a bold node, its children wrapped in ``__B__`` / ``__/B__``;
        * for a table cell or table header cell, its children unchanged;
        * ``None`` for every other kind of node.
    """
    assert isinstance(node, WikiNode)
    kind = node.kind
    if kind == NodeKind.ITALIC:
        return ["__I__", *node.children, "__/I__"]
    if kind == NodeKind.BOLD:
        return ["__B__", *node.children, "__/B__"]
    # In case someone puts tables inside tables...
    if kind in {NodeKind.TABLE_CELL, NodeKind.TABLE_HEADER_CELL}:
        return node.children
    return None

48 

49 

# Matches the inline formatting markers `__B__`, `__/B__`, `__I__` and
# `__/I__` (so italics too, despite the name). The pattern is one capturing
# group, which makes `BOLD_RE.split()` keep the markers in its result:
# even indexes hold text, odd indexes hold markers.
BOLD_RE = re.compile(r"(__/?[BI]__)")

51 

# Article forms that can sit alone in a table cell. `parse_table` does not
# emit such a cell as a form of its own; it keeps the text as a prefix for
# the next data cell.
ARTICLES: set[str] = {
    "ο",
    "η",
    "το",
    "την",
    "της",
    "τον",
    "τη",
    "οι",
    "τα",
    "των",
    "τους",
    "του",
    "τις",
}

70 

71 

def localize_verb_inflection_raw_tags(form: Form) -> None:
    """Extend ``form.tags`` with the mapped equivalents of its raw tags.

    Each raw tag is normalised (newlines become spaces, then lowercased) and
    looked up in ``base_tag_map``; raw tags without an entry are skipped.
    The mapped tags are de-duplicated among themselves, sorted, and appended
    to ``form.tags``. ``form.raw_tags`` is left untouched.
    """
    verb_tags = set()

    for raw_tag in form.raw_tags:
        clean_raw_tag = raw_tag.replace("\n", " ").lower()
        localized = base_tag_map.get(clean_raw_tag)
        if localized is not None:
            verb_tags.update(localized)

    # Sorted so that the output does not depend on set iteration order.
    form.tags.extend(sorted(verb_tags))

85 

86 

def process_inflection_section(
    wxr: WiktextractContext, data: WordEntry, snode: WikiNode
) -> None:
    """Extract inflected forms from every table found under ``snode``.

    The section is walked once with a node handler that expands templates
    and collects the table nodes it meets, together with the name of the
    top-level template they came from. Each collected table is handed to
    ``parse_table``, which adds forms to ``data.forms``. If at least one
    table was found, the raw tags of all forms in ``data.forms`` are then
    mapped to tags. Finally ``data.forms`` is replaced by the result of
    ``remove_duplicate_forms``.
    """
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in table_node_handler_fn
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not templates inside templates.
    template_depth = 0
    top_template_name: str | None = None

    def table_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | None:
        """Expand templates recursively and collect table nodes.

        A table node is recorded in ``table_nodes`` and replaced by an empty
        string. Returns ``None`` for every other non-template node.
        """
        assert isinstance(node, WikiNode)
        nonlocal template_depth
        nonlocal top_template_name
        if isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # templates are handled with this same handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            template_depth += 1
            # NOTE(review): the value returned here is whatever node_to_text
            # produces, while the annotation promises a list -- confirm.
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=table_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret

        if node.kind == NodeKind.TABLE:
            # XXX Handle tables here
            # template depth and top-level template name
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # Only the side effects of the handler matter; the HTML is discarded.
    wxr.wtp.node_to_html(snode, node_handler_fn=table_node_handler_fn)

    if table_nodes:
        for template_name, table_node in table_nodes:
            # XXX template_name
            parse_table(
                wxr,
                table_node,
                data,
                data.lang_code in GREEK_LANGCODES,
                template_name=template_name or "",
            )
        for form in data.forms:
            localize_verb_inflection_raw_tags(form)

    data.forms = remove_duplicate_forms(wxr, data.forms)

155 

156 

def _cell_spans(
    cell: WikiNode, wxr: WiktextractContext | None = None
) -> tuple[int, int]:
    """Return the ``(rowspan, colspan)`` declared on a table cell.

    A missing attribute counts as 1. If either attribute is not an integer,
    both fall back to 1. A span above 30 is reset to 1; this is reported
    through ``wxr.wtp.error`` only when ``wxr`` is given, so that a cell can
    be looked at more than once without repeating the message.
    """
    try:
        rowspan = int(cell.attrs.get("rowspan", "1"))
        colspan = int(cell.attrs.get("colspan", "1"))
    except ValueError:
        return 1, 1

    if colspan > 30:
        if wxr is not None:
            wxr.wtp.error(
                f"Colspan {colspan} over 30, set to 1",
                sortid="table/128/20250207",
            )
        colspan = 1
    if rowspan > 30:
        if wxr is not None:
            wxr.wtp.error(
                f"Rowspan {rowspan} over 30, set to 1",
                sortid="table/134/20250207b",
            )
        rowspan = 1
    return rowspan, colspan


def parse_table(
    wxr: WiktextractContext,
    tnode: WikiNode,
    data: WordEntry,
    is_greek_entry: bool = False,  # Whether the entry is for a Greek word
    template_name: str = "",
) -> None:
    """Parse inflection table. Generates 'form' data; 'foos' is a form of 'foo'
    with the tags ['plural'].

    ``tnode`` must be a wikitext table node or an HTML ``<table>`` node.
    The cells are first laid out on a grid that honours rowspan/colspan.
    The grid is then walked row by row and each cell is classified as a
    header, a bare article (kept as a prefix for the next data cell) or
    data. A data cell becomes one ``Form`` per alternative (split on ``&``
    and on line breaks), with the texts of the applicable headers as
    ``raw_tags``; data cells with no applicable header are reported and
    skipped.

    If any form was produced, a ``Form`` holding ``template_name`` with the
    tag ``inflection-template`` is appended to ``data.forms``, followed by
    the forms. A table without any cells adds nothing.
    """
    assert (isinstance(tnode, WikiNode) and tnode.kind == NodeKind.TABLE) or (
        isinstance(tnode, HTMLNode) and tnode.tag == "table"
    )

    is_html_table = isinstance(tnode, HTMLNode)

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    if wxr.config.expand_tables:
        # Explicit encoding: the dump contains Greek text and must not
        # depend on the platform's default encoding.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tnode)
            f.write(f"{text}\n")

    Row: TypeAlias = int
    Column: TypeAlias = int

    # We complete the table using nested dicts (instead of arrays for
    # convenience) such that when we come across a node, we push that node's
    # reference to each coordinate point in the table grid it occupies. Each
    # grid point can then be checked for if it's been handled already and
    # skipped if needed.
    table_grid: dict[Row, dict[Column, WikiNode]] = {}

    first_column_is_headers = True

    rows = (
        tnode.find_html_recursively("tr")
        if is_html_table
        else tnode.find_child_recursively(NodeKind.TABLE_ROW)
    )
    for r, row in enumerate(rows):
        c = 0
        if r not in table_grid:
            table_grid[r] = {}

        cells = (
            row.find_html(["th", "td"])
            if is_html_table
            else row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL,
            )
        )
        for cell in cells:
            # Skip grid points already taken by a cell spanning down from
            # an earlier row or across from earlier in this row.
            while c in table_grid[r]:
                c += 1

            rowspan, colspan = _cell_spans(cell, wxr)

            for rr in range(r, r + rowspan):
                if rr not in table_grid:
                    table_grid[rr] = {}
                for cc in range(c, c + colspan):
                    table_grid[rr][cc] = cell

    # Drop empty rows at the end of the table. Rows are added to the grid
    # in ascending order, so everything after the last row that has cells
    # is a trailing empty row.
    last_filled = max(
        (i for i, rowd in table_grid.items() if rowd), default=None
    )
    if last_filled is None:
        # No rows at all, or none of the rows has a cell: nothing to parse.
        return
    table_grid = {i: rowd for i, rowd in table_grid.items() if i <= last_filled}

    if len(table_grid[0]) == 1:
        # Table is one column in width, no headers on rows
        first_column_is_headers = False

    # NOTE(review): the comment here used to read "only one or two rows",
    # but the condition only matches exactly two rows. Behavior is kept
    # as it was; confirm which one is intended.
    if len(table_grid) == 2:
        first_column_is_headers = False

    # Headers are saved in two dict that has their keys made out of tuples
    # made of their "bookends": so {(1,1), "foo"} for a header that is made
    # up of the first cell only of a row in the column_hdrs dict.
    # If we come across a header that has those exact same bookends, only
    # then do we replace the previous tags with it; if you have overlapping
    # 'widths', leave them so that we inherit different 'levels' of headers.
    Spread = tuple[int, int]
    SpreadDict = dict[Spread, str]
    # The column and row headers are saved into big dicts: column_hdrs is a dict
    # whose key is what row or column we are in. The values of that table grid
    # square is a dict with the bookends (`Spread`) and the tags associated with
    # those bookends
    column_hdrs_all: dict[Column, SpreadDict] = {}
    row_hdrs_all: dict[Row, dict[Column, SpreadDict]] = {}

    forms: list[Form] = []
    processed: set[WikiNode] = set()
    # Some tables have cells with stuff like `του` we want to add to the
    # next cell
    prefix: str | None = None

    first_cells_are_bold = False
    found_unformatted_text = False

    # Data-cell texts that are only logged at debug level; they are still
    # handled as ordinary data below.
    debug_logged_texts = (
        "αι",
        "ένα",
        "ένας",
        "στα",
        "στη",
        "στην",
        "στης",
        "στις",
        "στο",
        "στον",
        "στου",
        "στους",
        "στων",
        "τ'",
        "ταις",
        "τας",
        "τες",
        "τη",
        "τοις",
        "τω",
    )

    for r, row_d in table_grid.items():
        for c, cell in row_d.items():
            # A spanning cell occupies several grid points; handle it once,
            # at the first (top-left) point it is met.
            if cell in processed:
                continue
            processed.add(cell)

            # Same clamped values as the ones the grid was built with;
            # oversized spans were already reported during grid building.
            rowspan, colspan = _cell_spans(cell)

            spans = process_cell_text(wxr, cell)

            if len(spans) <= 0:
                continue

            if r == 0 and spans[0][0]:  # starts_bold
                first_cells_are_bold = True

            text = clean_value(wxr, " ".join(span[3] for span in spans))

            this_is_header, unformatted_text = is_header(
                wxr,
                cell,
                spans,
                is_greek_entry,
                found_unformatted_text,
                first_cells_are_bold,
            )

            if unformatted_text is True:
                found_unformatted_text = True

            if this_is_header or (c == 0 and first_column_is_headers is True):
                # Because Greek wiktionary has its own written script to rely
                # in heuristics, we can use that. It also seems that for
                # tables in Greek-language entries even if the table doesn't
                # use proper header cells, you can trust bolding and italics.

                # Currently we don't care which "direction" the header points:
                # we add the tag to both column headers and row headers, and
                # rely on that all headers are on only rows or columns that
                # don't have data cells; ie. headers and data aren't mixed.

                # Each row and each column gets its own header data.
                # The Spread key is used to keep track which headers should
                # "overlap": if the spread is different, that should always
                # mean that one is contained within another and thus they're
                # not complementary headers, but one "bigger" category and
                # one "specific" category. If the Spread is identical, then
                # that's obviously two complementary headers, and the later one
                # overwrites the other.
                for rr in range(r, r + rowspan):
                    if rr not in row_hdrs_all:
                        row_hdrs_all[rr] = {c: {(r, r + rowspan): text}}
                    elif c not in row_hdrs_all[rr]:
                        row_hdrs_all[rr][c] = {(r, r + rowspan): text}
                    else:
                        # Also overwrites headers with the same "span"; simple
                        # way to have overlapping sections.
                        row_hdrs_all[rr][c][(r, r + rowspan)] = text

                for cc in range(c, c + colspan):
                    if cc not in column_hdrs_all:
                        column_hdrs_all[cc] = {(c, c + colspan): text}
                    else:
                        column_hdrs_all[cc][(c, c + colspan)] = text

                prefix = None

            elif text in ARTICLES:
                prefix = text
            else:
                # cell is data
                if text in debug_logged_texts:
                    wxr.wtp.debug(
                        f"Found '{text}' in table '{wxr.wtp.title}'",
                        sortid="table/335",
                    )
                tags: set[str] = set()
                for cc, vd in row_hdrs_all.get(r, {}).items():
                    # Only row headers to the left of this cell apply.
                    if c <= cc:
                        continue
                    for (start, end), tag in vd.items():
                        # The header has to cover all the rows of the cell.
                        if start > r or end < r + rowspan:
                            continue
                        tags.add(tag)
                for (start, end), tag in column_hdrs_all.get(c, {}).items():
                    # The header has to cover all the columns of the cell.
                    if start > c or end < c + colspan:
                        continue
                    tags.add(tag)
                texts = [text]
                if "&" in text:
                    texts = [t.strip() for t in text.split("&")]
                texts = [line for t in texts for line in t.splitlines()]
                if prefix is not None:
                    texts = [f"{prefix} {t}" for t in texts]
                    prefix = None
                if len(tags) > 0:
                    forms.extend(
                        Form(form=t, raw_tags=list(tags)) for t in texts
                    )
                else:
                    # If a cell has no tags in a table, it's probably a note
                    # or something.
                    wxr.wtp.warning(
                        f"Cell without any tags in table: {text}",
                        sortid="table/300/20250217",
                    )

    # raw_tags are left as they are here; converting them to tags with
    # convert_tags is currently disabled.

    if len(forms) > 0:
        data.forms.append(
            Form(form=template_name, tags=["inflection-template"])
        )
        data.forms.extend(forms)

452 

453 

454def process_cell_text( 

455 wxr: WiktextractContext, cell: WikiNode 

456) -> list[tuple[bool, bool, bool, str]]: 

457 cell_text = wxr.wtp.node_to_text(cell, node_handler_fn=cell_node_fn) 

458 cell_text = clean_value(wxr, cell_text) 

459 split_text = BOLD_RE.split(cell_text) 

460 

461 # bold, italics, is greek, text 

462 spans: list[tuple[bool, bool, bool, str]] = [] 

463 

464 inside_bold = False 

465 inside_italics = False 

466 for i, text in enumerate(split_text): 

467 text = text.strip() 

468 if not text: 

469 continue 

470 if i % 2 == 0: 470 ↛ 482line 470 didn't jump to line 482 because the condition on line 470 was always true

471 for ch in text: 471 ↛ 478line 471 didn't jump to line 478 because the loop on line 471 didn't complete

472 if not ch.isalpha(): 472 ↛ 473line 472 didn't jump to line 473 because the condition on line 472 was never true

473 continue 

474 greek = unicode_name(ch).startswith("GREEK") 

475 break 

476 else: 

477 # no alphanumerics detected 

478 continue 

479 

480 spans.append((inside_bold, inside_italics, greek, text)) 

481 continue 

482 match text: 

483 case "__B__": 

484 inside_bold = True 

485 case "__/B__": 

486 inside_bold = False 

487 case "__I__": 

488 inside_italics = True 

489 case "__/I__": 

490 inside_italics = False 

491 

492 return spans 

493 

494 

495UnformattedFound: TypeAlias = bool 

496 

497 

def is_header(
    wxr: WiktextractContext,
    cell: WikiNode,
    spans: list[tuple[bool, bool, bool, str]],
    is_greek_entry: bool,
    unformatted_text_found: bool,
    first_cells_are_bold: bool,
) -> tuple[bool, UnformattedFound]:
    """Decide whether a table cell is a header.

    Container for more complex logic stuff because trying to figure out
    if something is a header can get messy.

    Args:
        wxr: context, used to emit a warning for undecidable cells.
        cell: the cell node; its kind and ``style`` attribute are inspected.
        spans: ``(bold, italics, greek, text)`` spans of the cell; must not
            be empty. Only the first span is looked at.
        is_greek_entry: whether the table belongs to a Greek-language entry.
        unformatted_text_found: whether an earlier cell was unformatted.
        first_cells_are_bold: whether the first row started with bold text.

    Returns:
        ``(header, unformatted)``: whether the cell is a header, and whether
        the cell counts as unformatted text.

    Rules, in order:
        1. A header cell (wikitext header cell or HTML ``<th>``) is a header.
        2. Not a Greek entry: the cell is a header iff it starts with Greek
           text.
        3. Greek entry: italic text is a header; text that is not bold is
           data; bold text is a header if unformatted text was seen before
           or if the first row is bold. Otherwise a warning is emitted and
           the cell is treated as data.
    """
    # parse_table also accepts HTML tables, where header cells are <th>
    # nodes rather than TABLE_HEADER_CELL nodes.
    if cell.kind == NodeKind.TABLE_HEADER_CELL or (
        isinstance(cell, HTMLNode) and cell.tag == "th"
    ):
        return True, False

    starts_bold, starts_italicized, starts_greek, text = spans[0]

    # Formatting may also come from the style attribute of the cell.
    style = cell.attrs.get("style", "")
    if "bold" in style:
        starts_bold = True
    if "italic" in style:
        starts_italicized = True

    # Not a Greek entry
    if not is_greek_entry:
        # If the table is for another language other than Greek, a cell
        # starting with Greek text is a table header.
        # The flag used to be returned inverted here (True for formatted
        # text); it is only consulted for Greek entries, so results do not
        # change.
        return starts_greek, not (starts_bold or starts_italicized)

    # Is a Greek entry
    if starts_italicized:
        return True, False

    if not starts_bold:
        return False, True

    if unformatted_text_found:
        # This is bolded, but we've seen unformatted text before
        return True, False

    if first_cells_are_bold:
        return True, False

    wxr.wtp.warning(
        f"Can't be sure if bolded text entry '{text}' is a header or not",
        sortid="table/20250210a",
    )
    return False, False