Coverage for src/wiktextract/extractor/el/table.py: 5%

226 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2from typing import TypeAlias 

3from unicodedata import name as unicode_name 

4 

5from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

6 

7from wiktextract.clean import clean_value 

8from wiktextract.wxr_context import WiktextractContext 

9from wiktextract.wxr_logging import logger 

10 

11from .models import Form, WordEntry 

12from .parse_utils import GREEK_LANGCODES, remove_duplicate_forms 

13 

14# from .simple_tags import simple_tag_map 

15# from .tags_utils import convert_tags 

16 

# Shorthand used by the annotations in this file: a parse-tree item is either
# a plain string or a WikiNode.
Node = str | WikiNode


# GREEK TABLE HEURISTICS:
# In a table belonging to a Greek-language entry, a cell is a header if it
# is a header cell or its text is in italics.
# In a table belonging to a non-Greek entry, a cell containing Greek text is
# a header.


# Node functions differ from template functions: template functions control
# how templates are expanded (and otherwise processed), while node functions
# are consulted when parsed "abstract" nodes are turned into strings.
def cell_node_fn(
    node: WikiNode,
) -> list[Node] | None:
    """Node handler used when flattening a table cell into text.

    Italic and bold nodes are replaced by their children wrapped in the
    marker strings ``__I__``/``__/I__`` and ``__B__``/``__/B__``.  Table
    cells nested inside the cell are replaced by their children.  For every
    other node kind ``None`` is returned (presumably leaving the node to
    the caller's default handling -- confirm against wikitextprocessor).
    """
    assert isinstance(node, WikiNode)
    node_kind = node.kind

    # Formatting kinds and the (opening, closing) markers they map to.
    format_markers = {
        NodeKind.ITALIC: ("__I__", "__/I__"),
        NodeKind.BOLD: ("__B__", "__/B__"),
    }
    markers = format_markers.get(node_kind)
    if markers is not None:
        opening, closing = markers
        return [opening, *node.children, closing]

    # In case someone puts tables inside tables...
    if (
        node_kind == NodeKind.TABLE_CELL
        or node_kind == NodeKind.TABLE_HEADER_CELL
    ):
        return node.children
    return None

47 

48 

# Matches the bold/italic marker strings (`__B__`, `__/B__`, `__I__`,
# `__/I__`).  The capturing group makes `BOLD_RE.split()` keep the markers
# in its result, alternating with the text between them.
BOLD_RE = re.compile(r"(__/?[BI]__)")

# Article forms.  A table cell whose whole text is one of these is not
# emitted as a form of its own: parse_table() keeps it as a prefix for the
# next data cell.  Each form is listed once; a set literal gains nothing
# from repeated members.
ARTICLES: set[str] = {
    "ο",
    "η",
    "το",
    "την",
    "της",
    "τον",
    "τη",
    "οι",
    "τα",
    "των",
    "τους",
    "του",
    "τις",
}

69 

70 

def process_inflection_section(
    wxr: WiktextractContext, data: WordEntry, snode: WikiNode
):
    """Collect every table under `snode` and parse it into `data.forms`.

    The section is walked with a node handler that expands templates and
    records each table node together with the name of the top-level
    template it came from (``None`` for tables written directly in the
    wikitext).  Each recorded table is then handed to `parse_table`, and
    finally `data.forms` is passed through `remove_duplicate_forms`.
    """
    # (top-level template name or None, table node) in discovery order.
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # How deep inside a top-level template the handler currently is.  Only
    # the templates visible in the wikitext itself (depth 0) give their
    # name to the tables found inside them, not templates within templates.
    template_depth = 0
    top_template_name: str | None = None

    def collect_tables(
        node: WikiNode,
    ) -> list[str | WikiNode] | None:
        """Expand templates recursively and record the table nodes."""
        nonlocal template_depth, top_template_name
        assert isinstance(node, WikiNode)

        if isinstance(node, TemplateNode):
            # Expand the template and walk the result with this same
            # handler, so that tables produced by templates are found too.
            # Don't use "node_to_text" for the expansion input, that
            # causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # A top-level template in the original wikitext.
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=collect_tables
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret

        if node.kind == NodeKind.TABLE:
            # XXX Handle tables here
            # template depth and top-level template name
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # The return value is not used; the call is made for the handler's
    # side effect of filling `table_nodes`.
    wxr.wtp.node_to_html(snode, node_handler_fn=collect_tables)

    for template_name, table_node in table_nodes:
        # XXX template_name
        parse_table(
            wxr,
            table_node,
            data,
            data.lang_code in GREEK_LANGCODES,
            template_name=template_name or "",
        )

    data.forms = remove_duplicate_forms(wxr, data.forms)

137 

138 

# Largest rowspan/colspan that is believed; anything bigger is treated as 1.
_MAX_SPAN = 30

# Words that are only reported through debug logging when they make up a
# whole data cell; the cell is still processed as data afterwards.
_REPORTED_DATA_WORDS: frozenset[str] = frozenset(
    {
        "αι",
        "ένα",
        "ένας",
        "στα",
        "στη",
        "στην",
        "στης",
        "στις",
        "στο",
        "στον",
        "στου",
        "στους",
        "στων",
        "τ'",
        "ταις",
        "τας",
        "τες",
        "τη",
        "τοις",
        "τω",
    }
)


def _parse_span(cell: WikiNode, attr: str) -> int:
    """Return the `attr` ("rowspan"/"colspan") of `cell` as an int.

    A missing, non-numeric or non-positive value counts as 1, so that a
    malformed attribute can neither drop the cell from the grid nor affect
    the other span of the same cell.
    """
    try:
        span = int(cell.attrs.get(attr, "1"))
    except ValueError:
        return 1
    return span if span >= 1 else 1


def _cell_spans(
    wxr: WiktextractContext, cell: WikiNode, report: bool = False
) -> tuple[int, int]:
    """Return the effective ``(rowspan, colspan)`` of `cell`.

    Spans over `_MAX_SPAN` are reset to 1.  With `report` set, each reset
    is logged as an error; callers that revisit a cell leave it unset so
    the same problem is not logged twice.
    """
    rowspan = _parse_span(cell, "rowspan")  # 🡙
    colspan = _parse_span(cell, "colspan")  # 🡘

    if colspan > _MAX_SPAN:
        if report:
            wxr.wtp.error(
                f"Colspan {colspan} over {_MAX_SPAN}, set to 1",
                sortid="table/128/20250207",
            )
        colspan = 1
    if rowspan > _MAX_SPAN:
        if report:
            wxr.wtp.error(
                f"Rowspan {rowspan} over {_MAX_SPAN}, set to 1",
                sortid="table/134/20250207b",
            )
        rowspan = 1
    return rowspan, colspan


def _build_table_grid(
    wxr: WiktextractContext, tnode: WikiNode
) -> dict[int, dict[int, WikiNode]]:
    """Lay the cells of `tnode` out on a ``{row: {column: cell}}`` grid.

    Nested dicts are used instead of arrays for convenience: a cell is
    stored at every coordinate it occupies, so a cell with a rowspan or
    colspan appears several times and later cells skip occupied slots.
    """
    is_html_table = isinstance(tnode, HTMLNode)
    table_grid: dict[int, dict[int, WikiNode]] = {}

    rows = (
        tnode.find_html_recursively("tr")
        if is_html_table
        else tnode.find_child_recursively(NodeKind.TABLE_ROW)
    )
    for r, row in enumerate(rows):
        c = 0
        table_grid.setdefault(r, {})

        cells = (
            row.find_html(["th", "td"])
            if is_html_table
            else row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL,
            )
        )
        for cell in cells:
            # Skip slots already taken by cells spanning from above/left.
            while c in table_grid[r]:
                c += 1

            rowspan, colspan = _cell_spans(wxr, cell, report=True)

            for rr in range(r, r + rowspan):
                grid_row = table_grid.setdefault(rr, {})
                for cc in range(c, c + colspan):
                    grid_row[cc] = cell

    return table_grid


def parse_table(
    wxr: WiktextractContext,
    tnode: WikiNode,
    data: WordEntry,
    is_greek_entry: bool = False,  # Whether the entry is for a Greek word
    template_name: str = "",
) -> None:
    """Parse an inflection table and add the forms found to `data.forms`.

    Generates 'form' data; 'foos' is a form of 'foo' with the tags
    ['plural'].  Header cells supply the raw tags and the remaining cells
    are the forms.  If at least one form is found, a
    ``Form(form=template_name, tags=["inflection-template"])`` entry is
    appended before the forms themselves.

    `tnode` must be a wikitext table node or an HTML ``<table>`` node.  A
    table without any cells adds nothing.
    """
    assert (isinstance(tnode, WikiNode) and tnode.kind == NodeKind.TABLE) or (
        isinstance(tnode, HTMLNode) and tnode.tag == "table"
    )

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    if wxr.config.expand_tables:
        # Explicit UTF-8: the tables contain Greek text, which the locale's
        # default encoding may not be able to represent.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tnode)
            f.write(f"{text}\n")

    Row: TypeAlias = int
    Column: TypeAlias = int

    table_grid: dict[Row, dict[Column, WikiNode]] = _build_table_grid(
        wxr, tnode
    )

    # Drop empty rows at the end of the table.  Without a single cell
    # there is nothing to parse, so return instead of indexing into an
    # empty grid.
    last_filled = max(
        (r for r, row_d in table_grid.items() if row_d), default=None
    )
    if last_filled is None:
        return
    table_grid = {
        r: row_d for r, row_d in table_grid.items() if r <= last_filled
    }

    first_column_is_headers = True

    if len(table_grid[0]) == 1:
        # Table is one column in width, no headers on rows
        first_column_is_headers = False

    # NOTE(review): this comment used to say "only one or two rows", but
    # the test only matches tables with exactly two rows.  Behaviour is
    # left as it was; confirm whether one-row tables should be included.
    if len(table_grid) == 2:
        first_column_is_headers = False

    # Headers are saved in two dicts whose keys are tuples made of their
    # "bookends": so {(1,1), "foo"} for a header that is made up of the
    # first cell only of a row in the column_hdrs dict.
    # If we come across a header that has those exact same bookends, only
    # then do we replace the previous tags with it; if you have overlapping
    # 'widths', leave them so that we inherit different 'levels' of headers.
    Spread = tuple[int, int]
    SpreadDict = dict[Spread, str]
    # column_hdrs_all maps a column to its headers; row_hdrs_all maps a row
    # and then the column the header cell sits in to its headers.  In both
    # the innermost dict goes from bookends (`Spread`) to the header text.
    column_hdrs_all: dict[Column, SpreadDict] = {}
    row_hdrs_all: dict[Row, dict[Column, SpreadDict]] = {}

    forms: list[Form] = []
    # A spanning cell sits on several grid points; handle it only once.
    processed: set[WikiNode] = set()
    # Some tables have cells with stuff like `του` we want to add to the
    # next cell
    prefix: str | None = None

    first_cells_are_bold = False
    found_unformatted_text = False

    for r, row_d in table_grid.items():
        for c, cell in row_d.items():
            if cell in processed:
                continue
            processed.add(cell)

            # Same effective spans as were used to build the grid, so that
            # the header bookends agree with the cells' grid positions.
            rowspan, colspan = _cell_spans(wxr, cell)

            spans = process_cell_text(wxr, cell)

            if not spans:
                continue

            if r == 0 and spans[0][0]:  # starts_bold
                first_cells_are_bold = True

            text = clean_value(wxr, " ".join(span[3] for span in spans))

            this_is_header, unformatted_text = is_header(
                wxr,
                cell,
                spans,
                is_greek_entry,
                found_unformatted_text,
                first_cells_are_bold,
            )

            if unformatted_text is True:
                found_unformatted_text = True

            if this_is_header or (c == 0 and first_column_is_headers is True):
                # Because Greek wiktionary has its own written script to
                # rely on in heuristics, we can use that.  It also seems
                # that for tables in Greek-language entries even if the
                # table doesn't use proper header cells, you can trust
                # bolding and italics.

                # Currently we don't care which "direction" the header
                # points: we add the tag to both column headers and row
                # headers, and rely on that all headers are on only rows or
                # columns that don't have data cells; ie. headers and data
                # aren't mixed.

                # Each row and each column gets its own header data.
                # The Spread key is used to keep track which headers should
                # "overlap": if the spread is different, that should always
                # mean that one is contained within another and thus
                # they're not complementary headers, but one "bigger"
                # category and one "specific" category.  If the Spread is
                # identical, then that's obviously two complementary
                # headers, and the later one overwrites the other.
                row_spread = (r, r + rowspan)
                for rr in range(r, r + rowspan):
                    row_hdrs_all.setdefault(rr, {}).setdefault(c, {})[
                        row_spread
                    ] = text

                col_spread = (c, c + colspan)
                for cc in range(c, c + colspan):
                    column_hdrs_all.setdefault(cc, {})[col_spread] = text

                prefix = None

            elif text in ARTICLES:
                prefix = text
            else:
                # cell is data
                if text in _REPORTED_DATA_WORDS:
                    wxr.wtp.debug(
                        f"Found '{text}' in table '{wxr.wtp.title}'",
                        sortid="table/335",
                    )
                tags: set[str] = set()
                # Row headers: only from header cells left of this cell
                # whose bookends cover all the rows this cell occupies.
                for cc, vd in row_hdrs_all.get(r, {}).items():
                    if c <= cc:
                        continue
                    for (start, end), tag in vd.items():
                        if start > r or end < r + rowspan:
                            continue
                        tags.add(tag)
                # Column headers whose bookends cover all of this cell's
                # columns.
                for (start, end), tag in column_hdrs_all.get(c, {}).items():
                    if start > c or end < c + colspan:
                        continue
                    tags.add(tag)
                texts = [text]
                if "&" in text:
                    texts = [t.strip() for t in text.split("&")]
                # One form per line of each alternative.
                texts = [line for t in texts for line in t.splitlines()]
                if prefix is not None:
                    texts = [f"{prefix} {t}" for t in texts]
                    prefix = None
                if len(tags) > 0:
                    forms.extend(
                        Form(form=t, raw_tags=list(tags)) for t in texts
                    )
                else:
                    # If a cell has no tags in a table, it's probably a
                    # note or something.
                    wxr.wtp.warning(
                        f"Cell without any tags in table: {text}",
                        sortid="table/300/20250217",
                    )

    if len(forms) > 0:
        data.forms.append(
            Form(form=template_name, tags=["inflection-template"])
        )

        data.forms.extend(forms)

434 

435 

def process_cell_text(
    wxr: WiktextractContext, cell: WikiNode
) -> list[tuple[bool, bool, bool, str]]:
    """Split the text of `cell` into spans of uniform formatting.

    The cell is flattened to text with `cell_node_fn`, which marks bold
    and italic stretches, and the text is then cut at those markers.

    Returns a list of ``(bold, italics, is_greek, text)`` tuples, one per
    stretch of text that contains at least one alphabetic character.
    ``is_greek`` tells whether the Unicode name of the first alphabetic
    character of the stretch starts with "GREEK".
    """
    flattened = wxr.wtp.node_to_text(cell, node_handler_fn=cell_node_fn)
    # BOLD_RE captures its match, so the pieces alternate between text
    # (even indexes) and formatting markers (odd indexes).
    pieces = BOLD_RE.split(clean_value(wxr, flattened))

    # bold, italics, is greek, text
    spans: list[tuple[bool, bool, bool, str]] = []

    bold_on = False
    italics_on = False
    for idx, piece in enumerate(pieces):
        piece = piece.strip()
        if not piece:
            continue

        if idx % 2 == 1:
            # A marker: switch the matching formatting state.
            if piece == "__B__":
                bold_on = True
            elif piece == "__/B__":
                bold_on = False
            elif piece == "__I__":
                italics_on = True
            elif piece == "__/I__":
                italics_on = False
            continue

        first_letter = next((ch for ch in piece if ch.isalpha()), None)
        if first_letter is None:
            # no alphabetic characters in this stretch
            continue
        is_greek = unicode_name(first_letter).startswith("GREEK")
        spans.append((bold_on, italics_on, is_greek, piece))

    return spans

475 

476 

# Second element of is_header()'s return value.
UnformattedFound: TypeAlias = bool


def is_header(
    wxr: WiktextractContext,
    cell: WikiNode,
    spans: list[tuple[bool, bool, bool, str]],
    is_greek_entry: bool,
    unformatted_text_found: bool,
    first_cells_are_bold: bool,
) -> tuple[bool, UnformattedFound]:
    """Decide whether `cell` is a header cell.

    Container for more complex logic stuff because trying to figure out
    if something is a header can get messy.

    Args:
        wxr: context; only used to log a warning for undecidable cells.
        cell: the table cell, either a wikitext cell node or an HTML
            ``<th>``/``<td>`` node.
        spans: ``(bold, italics, is_greek, text)`` tuples describing the
            cell's text; only the first one is looked at.
        is_greek_entry: whether the table belongs to a Greek-language
            entry.
        unformatted_text_found: whether an earlier cell of the table
            reported unformatted text.
        first_cells_are_bold: whether a cell on the first row started
            with bold text.

    Returns:
        ``(is_header, flag)``.  In the Greek-entry branch the flag is True
        only for a cell that starts with text that is neither bold nor
        italic.
    """
    # Proper header cells are always headers.  In an HTML table the header
    # cells are `<th>` HTMLNodes, which don't have the wikitext
    # header-cell kind and have to be checked by tag.
    if cell.kind == NodeKind.TABLE_HEADER_CELL or (
        isinstance(cell, HTMLNode) and cell.tag == "th"
    ):
        return True, False

    if not spans:
        # No text to base the heuristics on.
        return False, False

    starts_bold, starts_italicized, starts_greek, text = spans[0]

    # Formatting may also come from the cell's own style attribute.
    style = cell.attrs.get("style", "")
    if "bold" in style:
        starts_bold = True
    if "italic" in style:
        starts_italicized = True

    # Not a Greek entry
    if not is_greek_entry:
        # If the table is for another language other than Greek, a cell
        # starting with Greek text is a table header.
        # NOTE(review): the flag returned here is True for *formatted*
        # text, the opposite of the Greek-entry branch below.  It is never
        # read back on this path (this branch returns before
        # `unformatted_text_found` is consulted), so it is left unchanged.
        return bool(starts_greek), (starts_bold or starts_italicized)

    # Is a Greek entry
    if starts_italicized is True:
        return True, False

    if starts_bold is False:
        return False, True

    if unformatted_text_found:
        # This is bolded, but we've seen unformatted text before
        return True, False

    if first_cells_are_bold:
        return True, False

    wxr.wtp.warning(
        f"Can't be sure if bolded text entry '{text}' is a header or not",
        sortid="table/20250210a",
    )
    return False, False