Coverage for src/wiktextract/extractor/el/table.py: 5%
226 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
2from typing import TypeAlias
3from unicodedata import name as unicode_name
5from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
7from wiktextract.clean import clean_value
8from wiktextract.wxr_context import WiktextractContext
9from wiktextract.wxr_logging import logger
11from .models import Form, WordEntry
12from .parse_utils import GREEK_LANGCODES, remove_duplicate_forms
14# from .simple_tags import simple_tag_map
15# from .tags_utils import convert_tags
# Shorthand for an element of a node's child list: either a plain string or a
# nested WikiNode. It could be imported from elsewhere, but it is small enough
# to simply declare here.
Node: TypeAlias = str | WikiNode
21# GREEK TABLE HEURISTICS:
22# If it's a table for a Greek language entry, if it's in a header or is in
23# italics, it's a header.
24# If it's NOT a Greek entry and has Greek text, it's a header.
27# node_fns are different from template_fns. template_fns are functions that
28# are used to handle how to expand (and otherwise process) templates, while
29# node functions are used when turning any parsed "abstract" nodes into strings.
def cell_node_fn(
    node: WikiNode,
) -> list[Node] | None:
    """Node handler used when a table cell is flattened to text.

    Italic and bold nodes are replaced by their children wrapped in the
    marker strings ``__I__``/``__/I__`` and ``__B__``/``__/B__``. Table
    cells nested inside the cell are replaced by their bare children.
    ``None`` is returned for every other kind of node.
    """
    assert isinstance(node, WikiNode)
    node_kind = node.kind
    if node_kind == NodeKind.ITALIC:
        opening, closing = "__I__", "__/I__"
    elif node_kind == NodeKind.BOLD:
        opening, closing = "__B__", "__/B__"
    elif node_kind in (NodeKind.TABLE_CELL, NodeKind.TABLE_HEADER_CELL):
        # In case someone puts tables inside tables...
        return node.children
    else:
        return None
    return [opening, *node.children, closing]
# Matches the bold/italic marker strings ``__B__``, ``__/B__``, ``__I__`` and
# ``__/I__``. The capturing group makes ``BOLD_RE.split()`` keep the markers
# in its result, so text and markers alternate.
BOLD_RE = re.compile(r"(__/?[BI]__)")

# Greek article forms. A table cell whose whole text is one of these is not
# stored as a form of its own; it is kept as a prefix for the next data cell.
ARTICLES: set[str] = {
    "ο",
    "η",
    "το",
    "την",
    "της",
    "τον",
    "τη",
    "οι",
    "τα",
    "των",
    "τους",
    "του",
    "τις",
}
def process_inflection_section(
    wxr: WiktextractContext, data: WordEntry, snode: WikiNode
):
    """Collect every table found under `snode` and parse it into forms.

    `snode` is walked with a node handler that expands templates
    recursively and records each table node it meets, together with the
    name of the top-level template the table came from (``None`` when the
    table is not inside a template). Each recorded table is handed to
    `parse_table`, which adds to `data.forms`; afterwards `data.forms` is
    replaced by the result of `remove_duplicate_forms`.
    """
    found_tables: list[tuple[str | None, WikiNode]] = []
    # `depth` counts how many template expansions the handler is currently
    # nested inside. Only templates met at depth 0 (the ones visible in the
    # original wikitext) give their name to the tables found within them.
    depth = 0
    outer_template: str | None = None

    def table_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | None:
        """Expand templates recursively and record table nodes.

        Tables are appended to `found_tables` and replaced by an empty
        string; templates are replaced by the text of their expansion;
        for any other node ``None`` is returned.
        """
        nonlocal depth, outer_template
        assert isinstance(node, WikiNode)

        if isinstance(node, TemplateNode):
            # Expand via wikitext so that nodes inside the expansion are
            # passed through this same handler. Don't use "node_to_text"
            # for the expansion source, that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if depth == 0:
                # A top-level template in the original wikitext.
                outer_template = node.template_name
            reparsed = wxr.wtp.parse(expanded)

            depth += 1
            rendered = wxr.wtp.node_to_text(
                reparsed, node_handler_fn=table_node_handler_fn
            )
            depth -= 1
            if depth == 0:
                outer_template = None
            return rendered

        if node.kind == NodeKind.TABLE:
            found_tables.append((outer_template, node))
            return [""]
        return None

    # The conversion is run only for the handler's side effects; the HTML
    # it produces is thrown away.
    wxr.wtp.node_to_html(snode, node_handler_fn=table_node_handler_fn)

    if not found_tables:
        return

    for name, table in found_tables:
        parse_table(
            wxr,
            table,
            data,
            data.lang_code in GREEK_LANGCODES,
            template_name=name or "",
        )

    data.forms = remove_duplicate_forms(wxr, data.forms)
# Texts of data cells whose appearance is only reported with a debug message.
_DEBUG_LOGGED_CELL_TEXTS = frozenset(
    {
        "αι",
        "ένα",
        "ένας",
        "στα",
        "στη",
        "στην",
        "στης",
        "στις",
        "στο",
        "στον",
        "στου",
        "στους",
        "στων",
        "τ'",
        "ταις",
        "τας",
        "τες",
        "τη",
        "τοις",
        "τω",
    }
)


def _cell_extent(wxr: WiktextractContext, cell: WikiNode) -> tuple[int, int]:
    """Return the sanitized ``(rowspan, colspan)`` of a table cell.

    Unparsable attributes make both values fall back to 1. A value over 30
    is reported with `wxr.wtp.error` and replaced by 1. A value below 1 is
    also replaced by 1, because such a span would make the cell occupy no
    grid square at all and silently drop its contents.
    """
    try:
        rowspan = int(cell.attrs.get("rowspan", "1"))  # 🡙
        colspan = int(cell.attrs.get("colspan", "1"))  # 🡘
    except ValueError:
        rowspan = 1
        colspan = 1

    if colspan > 30:
        wxr.wtp.error(
            f"Colspan {colspan} over 30, set to 1",
            sortid="table/128/20250207",
        )
        colspan = 1
    if rowspan > 30:
        wxr.wtp.error(
            f"Rowspan {rowspan} over 30, set to 1",
            sortid="table/134/20250207b",
        )
        rowspan = 1

    return max(rowspan, 1), max(colspan, 1)


def _build_table_grid(
    wxr: WiktextractContext, tnode: WikiNode, is_html_table: bool
) -> tuple[dict[int, dict[int, WikiNode]], dict[WikiNode, tuple[int, int]]]:
    """Lay the cells of `tnode` out on a row/column grid.

    Returns the grid and a mapping from each placed cell to its sanitized
    ``(rowspan, colspan)``. Nested dicts are used instead of arrays for
    convenience: a cell's reference is pushed to every grid square it
    occupies, so that each square can later be checked for whether its
    cell has been handled already. Empty rows at the end of the table are
    removed; the grid is empty if no row contains a cell.
    """
    grid: dict[int, dict[int, WikiNode]] = {}
    extents: dict[WikiNode, tuple[int, int]] = {}

    rows = (
        tnode.find_html_recursively("tr")
        if is_html_table
        else tnode.find_child_recursively(NodeKind.TABLE_ROW)
    )
    for r, row in enumerate(rows):
        c = 0
        grid.setdefault(r, {})

        cells = (
            row.find_html(["th", "td"])
            if is_html_table
            else row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL,
            )
        )
        for cell in cells:
            # Skip squares already taken by earlier cells, including cells
            # spanning down from the rows above.
            while c in grid[r]:
                c += 1

            rowspan, colspan = _cell_extent(wxr, cell)
            extents[cell] = (rowspan, colspan)

            for rr in range(r, r + rowspan):
                grid_row = grid.setdefault(rr, {})
                for cc in range(c, c + colspan):
                    grid_row[cc] = cell

    # Drop empty rows at the end of the table.
    last_used = max((r for r, row_d in grid.items() if row_d), default=None)
    if last_used is None:
        return {}, extents
    grid = {r: row_d for r, row_d in grid.items() if r <= last_used}
    return grid, extents


def _inherited_tags(
    row_hdrs_all: dict[int, dict[int, dict[tuple[int, int], str]]],
    column_hdrs_all: dict[int, dict[tuple[int, int], str]],
    r: int,
    c: int,
    rowspan: int,
    colspan: int,
) -> set[str]:
    """Collect the header texts that apply to the data cell at ``(r, c)``.

    Row headers are taken only from columns to the left of `c`. A header
    applies when its spread fully covers the cell's own extent.
    """
    tags: set[str] = set()
    for hdr_col, spreads in row_hdrs_all.get(r, {}).items():
        if c <= hdr_col:
            continue
        for (start, end), tag in spreads.items():
            if start > r or end < r + rowspan:
                continue
            tags.add(tag)
    for (start, end), tag in column_hdrs_all.get(c, {}).items():
        if start > c or end < c + colspan:
            continue
        tags.add(tag)
    return tags


def parse_table(
    wxr: WiktextractContext,
    tnode: WikiNode,
    data: WordEntry,
    is_greek_entry: bool = False,  # Whether the entry is for a Greek word
    template_name: str = "",
) -> None:
    """Parse an inflection table and append the forms found to `data.forms`.

    Generates 'form' data; 'foos' is a form of 'foo' with the tags
    ['plural'].

    The table is laid out on a grid, then every cell is classified as a
    header, an article prefix (see `ARTICLES`) or a data cell. Header texts
    are recorded per row and per column; a data cell becomes one or more
    `Form` objects whose `raw_tags` are the header texts covering it. Data
    cells that end up without any tags are reported with a warning and
    skipped. If any form was produced, a `Form` holding `template_name`
    with the tag ``inflection-template`` is appended before the forms.

    A table without any cells produces nothing.

    Args:
        wxr: Extraction context; used for text conversion and logging.
        tnode: A wikitext table node or an HTML ``table`` node.
        data: The word entry whose `forms` list is extended.
        is_greek_entry: Whether the entry is for a Greek word; passed on
            to `is_header`.
        template_name: Name stored in the ``inflection-template`` form.
    """
    assert (isinstance(tnode, WikiNode) and tnode.kind == NodeKind.TABLE) or (
        isinstance(tnode, HTMLNode) and tnode.tag == "table"
    )

    is_html_table = isinstance(tnode, HTMLNode)

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    if wxr.config.expand_tables:
        # The dump contains Greek text, so don't depend on the platform's
        # default encoding.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tnode)
            f.write(f"{text}\n")

    # Spans are sanitized once, while the grid is built, and reused below so
    # that header spreads always agree with the squares a cell occupies.
    table_grid, cell_extents = _build_table_grid(wxr, tnode, is_html_table)
    if not table_grid:
        # No row contains a cell; there is nothing to extract.
        return

    first_column_is_headers = True

    if len(table_grid[0]) == 1:
        # Table is one column in width, no headers on rows
        first_column_is_headers = False

    # NOTE(review): this was described as "only one or two rows", but only
    # a grid of exactly two rows is matched — confirm which is intended.
    if len(table_grid) == 2:
        first_column_is_headers = False

    # Headers are saved in two dicts whose innermost keys are tuples made of
    # the header's "bookends" (`Spread`): the index where the header cell
    # starts and the index just past where it ends. If we come across a
    # header with exactly the same bookends, it replaces the previous text;
    # headers with different, overlapping widths are all kept so that cells
    # inherit the different 'levels' of headers.
    Spread = tuple[int, int]
    SpreadDict = dict[Spread, str]
    # column_hdrs_all is keyed by column; row_hdrs_all is keyed by row and
    # then by the column the header cell sits in.
    column_hdrs_all: dict[int, SpreadDict] = {}
    row_hdrs_all: dict[int, dict[int, SpreadDict]] = {}

    forms: list[Form] = []
    processed: set[WikiNode] = set()
    # Some tables have cells with stuff like `του` we want to add to the
    # next cell
    prefix: str | None = None

    first_cells_are_bold = False
    found_unformatted_text = False

    for r, row_d in table_grid.items():
        for c, cell in row_d.items():
            # A spanning cell appears in several squares; handle it once.
            if cell in processed:
                continue
            processed.add(cell)

            rowspan, colspan = cell_extents[cell]

            spans = process_cell_text(wxr, cell)
            if not spans:
                continue

            if r == 0 and spans[0][0]:  # starts_bold
                first_cells_are_bold = True

            text = clean_value(wxr, " ".join(span[3] for span in spans))

            this_is_header, unformatted_text = is_header(
                wxr,
                cell,
                spans,
                is_greek_entry,
                found_unformatted_text,
                first_cells_are_bold,
            )

            if unformatted_text is True:
                found_unformatted_text = True

            if this_is_header or (c == 0 and first_column_is_headers):
                # Because Greek wiktionary has its own written script to
                # rely on in heuristics, we can use that. It also seems that
                # for tables in Greek-language entries even if the table
                # doesn't use proper header cells, you can trust bolding and
                # italics.

                # Currently we don't care which "direction" the header
                # points: we add the text to both column headers and row
                # headers, and rely on all headers being on rows or columns
                # that don't have data cells; ie. headers and data aren't
                # mixed.

                # If two headers have different spreads, one should be
                # contained within the other: a "bigger" category and a
                # more "specific" one, and both are kept. If the spread is
                # identical, the later header overwrites the earlier one.
                row_spread = (r, r + rowspan)
                for rr in range(r, r + rowspan):
                    row_hdrs_all.setdefault(rr, {}).setdefault(c, {})[
                        row_spread
                    ] = text

                column_spread = (c, c + colspan)
                for cc in range(c, c + colspan):
                    column_hdrs_all.setdefault(cc, {})[column_spread] = text

                prefix = None

            elif text in ARTICLES:
                prefix = text
            else:
                # cell is data
                if text in _DEBUG_LOGGED_CELL_TEXTS:
                    wxr.wtp.debug(
                        f"Found '{text}' in table '{wxr.wtp.title}'",
                        sortid="table/335",
                    )
                tags = _inherited_tags(
                    row_hdrs_all, column_hdrs_all, r, c, rowspan, colspan
                )

                # One cell may hold several forms, separated by "&" or by
                # line breaks.
                texts = [text]
                if "&" in text:
                    texts = [t.strip() for t in text.split("&")]
                texts = [line for part in texts for line in part.splitlines()]
                if prefix is not None:
                    texts = [f"{prefix} {t}" for t in texts]
                    prefix = None
                if tags:
                    forms.extend(
                        Form(form=t, raw_tags=list(tags)) for t in texts
                    )
                else:
                    # If a cell has no tags in a table, it's probably a note
                    # or something.
                    wxr.wtp.warning(
                        f"Cell without any tags in table: {text}",
                        sortid="table/300/20250217",
                    )

    # NOTE: raw_tags are stored as found; they are not converted to
    # canonical tags here.

    if forms:
        data.forms.append(
            Form(form=template_name, tags=["inflection-template"])
        )
        data.forms.extend(forms)
436def process_cell_text(
437 wxr: WiktextractContext, cell: WikiNode
438) -> list[tuple[bool, bool, bool, str]]:
439 cell_text = wxr.wtp.node_to_text(cell, node_handler_fn=cell_node_fn)
440 cell_text = clean_value(wxr, cell_text)
441 split_text = BOLD_RE.split(cell_text)
443 # bold, italics, is greek, text
444 spans: list[tuple[bool, bool, bool, str]] = []
446 inside_bold = False
447 inside_italics = False
448 for i, text in enumerate(split_text):
449 text = text.strip()
450 if not text:
451 continue
452 if i % 2 == 0:
453 for ch in text:
454 if not ch.isalpha():
455 continue
456 greek = unicode_name(ch).startswith("GREEK")
457 break
458 else:
459 # no alphanumerics detected
460 continue
462 spans.append((inside_bold, inside_italics, greek, text))
463 continue
464 match text:
465 case "__B__":
466 inside_bold = True
467 case "__/B__":
468 inside_bold = False
469 case "__I__":
470 inside_italics = True
471 case "__/I__":
472 inside_italics = False
474 return spans
UnformattedFound: TypeAlias = bool


def is_header(
    wxr: WiktextractContext,
    cell: WikiNode,
    spans: list[tuple[bool, bool, bool, str]],
    is_greek_entry: bool,
    unformatted_text_found: bool,
    first_cells_are_bold,
) -> tuple[bool, UnformattedFound]:
    """Decide whether a table cell is a header.

    Kept in its own function because figuring out if something is a header
    can get messy. Only the first span of the cell is looked at; a
    ``style`` attribute on the cell containing "bold" or "italic" counts
    like the corresponding formatting.

    Rules, in order:

    * A real header cell is always a header.
    * In a non-Greek entry, a cell is a header exactly when it starts with
      Greek text.
    * In a Greek entry, italic text is a header and non-bold text is data.
      Bold text is a header if unformatted text has been seen before or if
      the first row had bold cells; otherwise a warning is logged and the
      cell is treated as data.

    Returns:
        ``(is_header, flag)``. In a Greek entry the flag is true when the
        cell is plain, unformatted data.
        NOTE(review): for non-Greek entries the flag is instead true when
        the text IS bold or italic, which looks inverted relative to the
        `UnformattedFound` name — confirm the intent.
    """
    if cell.kind == NodeKind.TABLE_HEADER_CELL:
        return True, False

    first_bold, first_italic, first_greek, first_text = spans[0]

    inline_style = cell.attrs.get("style", "")
    first_bold = first_bold or "bold" in inline_style
    first_italic = first_italic or "italic" in inline_style

    if not is_greek_entry:
        # If the table is for another language other than Greek, a cell
        # starting with Greek text is a table header.
        return bool(first_greek), (first_bold or first_italic)

    # From here on: Greek entry.
    if first_italic:
        return True, False

    if not first_bold:
        return False, True

    # Bolded text: a header if we've seen unformatted text before, or if
    # the cells of the first row were bold.
    if unformatted_text_found or first_cells_are_bold:
        return True, False

    wxr.wtp.warning(
        f"Can't be sure if bolded text entry '{first_text}' is a header or not",
        sortid="table/20250210a",
    )
    return False, False