Coverage for src/wiktextract/extractor/el/table.py: 67%
239 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2from typing import TypeAlias
3from unicodedata import name as unicode_name
5from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
7from wiktextract.clean import clean_value
8from wiktextract.extractor.el.tags import base_tag_map
9from wiktextract.wxr_context import WiktextractContext
10from wiktextract.wxr_logging import logger
12from .models import Form, WordEntry
13from .parse_utils import GREEK_LANGCODES, remove_duplicate_forms
15# from .simple_tags import simple_tag_map
16# from .tags_utils import convert_tags
# Shorthand for this file: the item type of the lists returned by the node
# handler functions below. Could be an import, but it's so simple...
Node: TypeAlias = str | WikiNode
# GREEK TABLE HEURISTICS:
# In a table for a Greek-language entry, a cell that is a header cell or
# whose text starts in italics is treated as a header.
# In a table for a non-Greek entry, a cell that starts with Greek text is
# treated as a header.
# node_fns are different from template_fns. template_fns are functions that
# are used to handle how to expand (and otherwise process) templates, while
# node functions are used when turning any parsed "abstract" nodes into strings.
def cell_node_fn(
    node: WikiNode,
) -> list[Node] | None:
    """Node handler used when converting a table cell to text.

    Italic and bold nodes are replaced by their children wrapped in
    ``__I__``/``__/I__`` and ``__B__``/``__/B__`` marker strings, so that
    the formatting can be recovered from the flattened text afterwards.
    Table cells and table header cells are replaced by their children.
    Any other node yields ``None``.
    """
    assert isinstance(node, WikiNode)

    # Opening/closing marker pair for each formatting node kind.
    markers = {
        NodeKind.ITALIC: ("__I__", "__/I__"),
        NodeKind.BOLD: ("__B__", "__/B__"),
    }
    pair = markers.get(node.kind)
    if pair is not None:
        opening, closing = pair
        return [opening, *node.children, closing]

    # In case someone puts tables inside tables...
    if node.kind in (NodeKind.TABLE_CELL, NodeKind.TABLE_HEADER_CELL):
        return node.children
    return None
# Splits text on the formatting markers inserted by cell_node_fn. Despite
# the name, it matches both the bold (`__B__`, `__/B__`) and the italic
# (`__I__`, `__/I__`) markers; because the pattern is a capturing group,
# `split()` keeps the markers at the odd indices of the result.
BOLD_RE = re.compile(r"(__/?[BI]__)")

# Cell texts that are not emitted as forms of their own: parse_table stores
# such a text as a prefix for the next data cell instead.
# Each entry is listed exactly once (the set literal used to repeat "το",
# "οι" and "τα", which had no effect on the resulting set).
ARTICLES: set[str] = {
    "ο",
    "η",
    "το",
    "την",
    "της",
    "τον",
    "τη",
    "οι",
    "τα",
    "των",
    "τους",
    "του",
    "τις",
}
def localize_verb_inflection_raw_tags(form: Form) -> None:
    """Extend ``form.tags`` with the tags that ``base_tag_map`` lists for
    the form's raw tags.

    Each raw tag is looked up after replacing newlines with spaces and
    lower-casing it; raw tags without a mapping are skipped. The mapped tags
    are de-duplicated and appended in sorted order. ``form.raw_tags`` itself
    is left untouched.
    """
    mapped_tags: set[str] = set()

    for raw in form.raw_tags:
        lookup_key = raw.replace("\n", " ").lower()
        mapped = base_tag_map.get(lookup_key)
        if mapped is None:
            continue
        mapped_tags.update(mapped)

    form.tags.extend(sorted(mapped_tags))
def process_inflection_section(
    wxr: WiktextractContext, data: WordEntry, snode: WikiNode
):
    """Find the tables under ``snode`` and parse them into ``data.forms``.

    The section is walked once with a node handler that expands templates
    and records every table node it meets. Each recorded table is then
    handed to ``parse_table``; afterwards the raw tags of all forms in
    ``data.forms`` are localized and duplicate forms are removed.
    """
    # (name of the enclosing top-level template or None, table node)
    found_tables: list[tuple[str | None, WikiNode]] = []
    # `depth` tells the handler how deep inside a top-level template it
    # currently is: only the name of the template that is visible in the
    # original wikitext is remembered, not templates inside templates.
    depth = 0
    outer_template: str | None = None

    def collect_tables(
        node: WikiNode,
    ) -> list[str | WikiNode] | None:
        """Node handler that expands templates and records table nodes.

        Template nodes are expanded, re-parsed and walked with this same
        handler. Table nodes are appended to ``found_tables`` together with
        the current top-level template name and replaced by an empty
        string. Every other node yields ``None``.
        """
        nonlocal depth, outer_template
        assert isinstance(node, WikiNode)

        if isinstance(node, TemplateNode):
            # Recursively expand templates so that nodes inside the
            # templates also pass through this handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            wikitext = wxr.wtp.node_to_wikitext(node)
            expanded = wxr.wtp.expand(wikitext)
            if depth == 0:
                # A top-level template in the original wikitext.
                outer_template = node.template_name
            reparsed = wxr.wtp.parse(expanded)

            depth += 1
            rendered = wxr.wtp.node_to_text(
                reparsed, node_handler_fn=collect_tables
            )
            depth -= 1
            if depth == 0:
                outer_template = None
            return rendered

        if node.kind == NodeKind.TABLE:
            # XXX Handle tables here
            found_tables.append((outer_template, node))
            return [""]
        return None

    # The HTML output is discarded; the walk is only run so that the
    # handler fills `found_tables`.
    _ = wxr.wtp.node_to_html(snode, node_handler_fn=collect_tables)

    if found_tables:
        for name, table in found_tables:
            # XXX template_name
            parse_table(
                wxr,
                table,
                data,
                data.lang_code in GREEK_LANGCODES,
                template_name=name or "",
            )
        for entry_form in data.forms:
            localize_verb_inflection_raw_tags(entry_form)

    data.forms = remove_duplicate_forms(wxr, data.forms)
# Cell texts that are only reported with a debug message when they show up
# as data; they are still processed like any other data cell.
_DEBUG_REPORTED_CELL_TEXTS = frozenset(
    {
        "αι",
        "ένα",
        "ένας",
        "στα",
        "στη",
        "στην",
        "στης",
        "στις",
        "στο",
        "στον",
        "στου",
        "στους",
        "στων",
        "τ'",
        "ταις",
        "τας",
        "τες",
        "τη",
        "τοις",
        "τω",
    }
)


def _cell_span(cell: WikiNode) -> tuple[int, int]:
    """Return ``(rowspan, colspan)`` read from the attributes of ``cell``.

    A missing attribute counts as 1. If either attribute is not a valid
    integer, both values fall back to 1.
    """
    try:
        rowspan = int(cell.attrs.get("rowspan", "1"))  # 🡙
        colspan = int(cell.attrs.get("colspan", "1"))  # 🡘
    except ValueError:
        return 1, 1
    return rowspan, colspan


def parse_table(
    wxr: WiktextractContext,
    tnode: WikiNode,
    data: WordEntry,
    is_greek_entry: bool = False,  # Whether the entry is for a Greek word
    template_name: str = "",
) -> None:
    """Parse an inflection table and append the forms found to ``data.forms``.

    Generates 'form' data; 'foos' is a form of 'foo' with the tags
    ['plural']. The header texts that apply to a data cell become the
    ``raw_tags`` of its forms.

    Args:
        wxr: Context used for text conversion and for reporting messages.
        tnode: A wikitext table node or an HTML ``<table>`` node.
        data: The entry whose ``forms`` list is extended in place.
        is_greek_entry: Whether the entry is for a Greek word; passed on to
            ``is_header``.
        template_name: Stored as the ``form`` of an extra
            ``inflection-template`` entry that is put in front of the forms
            of this table.

    If the table yields no forms, ``data.forms`` is left unchanged. A table
    without any non-empty row is skipped silently instead of raising.
    """
    assert (isinstance(tnode, WikiNode) and tnode.kind == NodeKind.TABLE) or (
        isinstance(tnode, HTMLNode) and tnode.tag == "table"
    )

    is_html_table = isinstance(tnode, HTMLNode)

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    if wxr.config.expand_tables:
        # Explicit UTF-8: the dump contains Greek text and must not depend
        # on the locale's default encoding.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tnode)
            f.write(f"{text}\n")

    Row: TypeAlias = int
    Column: TypeAlias = int

    # We complete the table using nested dicts (instead of arrays for
    # convenience) such that when we come across a node, we push that node's
    # reference to each coordinate point in the table grid it occupies. Each
    # grid point can then be checked for if it's been handled already and
    # skipped if needed.
    table_grid: dict[Row, dict[Column, WikiNode]] = {}

    first_column_is_headers = True

    rows = (
        tnode.find_html_recursively("tr")
        if is_html_table
        else tnode.find_child_recursively(NodeKind.TABLE_ROW)
    )
    for r, row in enumerate(rows):
        c = 0
        table_grid.setdefault(r, {})

        cells = (
            row.find_html(["th", "td"])
            if is_html_table
            else row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL,
            )
        )
        for cell in cells:
            # Skip grid points already claimed by a cell spanning into them.
            while c in table_grid[r]:
                c += 1

            rowspan, colspan = _cell_span(cell)

            # NOTE(review): this clamp only affects the grid layout; the
            # header spreads computed further down use the unclamped
            # attribute values -- confirm that this is intended.
            if colspan > 30:
                wxr.wtp.error(
                    f"Colspan {colspan} over 30, set to 1",
                    sortid="table/128/20250207",
                )
                colspan = 1
            if rowspan > 30:
                wxr.wtp.error(
                    f"Rowspan {rowspan} over 30, set to 1",
                    sortid="table/134/20250207b",
                )
                rowspan = 1

            for rr in range(r, r + rowspan):
                grid_row = table_grid.setdefault(rr, {})
                for cc in range(c, c + colspan):
                    grid_row[cc] = cell

    # Drop empty rows at the end of the table. Row keys are inserted in
    # increasing order, so walking the items backwards finds the last row
    # that has any cells.
    last_item = next(
        (i for i, rowd in reversed(table_grid.items()) if rowd), None
    )
    if last_item is None:
        # No rows at all, or only empty rows: nothing to extract.
        return
    table_grid = {i: rowd for i, rowd in table_grid.items() if i <= last_item}

    if len(table_grid[0]) == 1:
        # Table is one column in width, no headers on rows
        first_column_is_headers = False

    # NOTE(review): the original comment here read "There's only one or two
    # rows", but the test only matches exactly two rows -- confirm which
    # one is intended.
    if len(table_grid) == 2:
        first_column_is_headers = False

    # Headers are saved in two dicts whose keys are tuples made of their
    # "bookends": so {(1, 2): "foo"} for a header that starts at index 1 and
    # spans one row or column.
    # If we come across a header that has those exact same bookends, only
    # then do we replace the previous tags with it; if you have overlapping
    # 'widths', leave them so that we inherit different 'levels' of headers.
    Spread = tuple[int, int]
    SpreadDict = dict[Spread, str]
    # column_hdrs_all maps a column to the header spreads covering it;
    # row_hdrs_all maps a row to, per header column, the header spreads
    # covering that row.
    column_hdrs_all: dict[Column, SpreadDict] = {}
    row_hdrs_all: dict[Row, dict[Column, SpreadDict]] = {}

    forms: list[Form] = []
    processed: set[WikiNode] = set()
    # Some tables have cells with stuff like `του` we want to add to the
    # next cell
    prefix: str | None = None

    first_cells_are_bold = False
    found_unformatted_text = False

    for r, row_d in table_grid.items():
        for c, cell in row_d.items():
            # A spanning cell occupies several grid points; handle it once.
            if cell in processed:
                continue
            processed.add(cell)

            rowspan, colspan = _cell_span(cell)

            spans = process_cell_text(wxr, cell)
            if not spans:
                continue

            if r == 0 and spans[0][0]:  # starts_bold
                first_cells_are_bold = True

            text = clean_value(wxr, " ".join(span[3] for span in spans))

            this_is_header, unformatted_text = is_header(
                wxr,
                cell,
                spans,
                is_greek_entry,
                found_unformatted_text,
                first_cells_are_bold,
            )

            if unformatted_text is True:
                found_unformatted_text = True

            if this_is_header or (c == 0 and first_column_is_headers):
                # Because Greek wiktionary has its own written script to rely
                # in heuristics, we can use that. It also seems that for
                # tables in Greek-language entries even if the table doesn't
                # use proper header cells, you can trust bolding and italics.

                # Currently we don't care which "direction" the header points:
                # we add the tag to both column headers and row headers, and
                # rely on that all headers are on only rows or columns that
                # don't have data cells; ie. headers and data aren't mixed.

                # Each row and each column gets its own header data.
                # The Spread key is used to keep track which headers should
                # "overlap": if the spread is different, that should always
                # mean that one is contained within another and thus they're
                # not complementary headers, but one "bigger" category and
                # one "specific" category. If the Spread is identical, then
                # that's obviously two complementary headers, and the later one
                # overwrites the other.
                row_spread = (r, r + rowspan)
                for rr in range(r, r + rowspan):
                    row_hdrs_all.setdefault(rr, {}).setdefault(c, {})[
                        row_spread
                    ] = text

                col_spread = (c, c + colspan)
                for cc in range(c, c + colspan):
                    column_hdrs_all.setdefault(cc, {})[col_spread] = text

                prefix = None

            elif text in ARTICLES:
                # Remember the article; it is prepended to the next data cell.
                prefix = text
            else:
                # cell is data
                if text in _DEBUG_REPORTED_CELL_TEXTS:
                    wxr.wtp.debug(
                        f"Found '{text}' in table '{wxr.wtp.title}'",
                        sortid="table/335",
                    )

                # NOTE(review): the tags are collected in a set, so the
                # order of `raw_tags` below is not guaranteed to be stable
                # between runs.
                tags: set[str] = set()
                # Row headers: only header columns to the left of this cell
                # whose spread covers all the rows this cell occupies.
                for cc, vd in row_hdrs_all.get(r, {}).items():
                    if c <= cc:
                        continue
                    for (start, end), tag in vd.items():
                        if start > r or end < r + rowspan:
                            continue
                        tags.add(tag)
                # Column headers: spreads covering all columns of this cell.
                for (start, end), tag in column_hdrs_all.get(c, {}).items():
                    if start > c or end < c + colspan:
                        continue
                    tags.add(tag)

                # A cell may hold several forms, separated by "&" and/or
                # line breaks.
                texts = [text]
                if "&" in text:
                    texts = [t.strip() for t in text.split("&")]
                texts = [line for part in texts for line in part.splitlines()]
                if prefix is not None:
                    texts = [f"{prefix} {t}" for t in texts]
                    prefix = None

                if tags:
                    forms.extend(
                        Form(form=form_text, raw_tags=list(tags))
                        for form_text in texts
                    )
                else:
                    # If a cell has no tags in a table, it's probably a note
                    # or something.
                    wxr.wtp.warning(
                        f"Cell without any tags in table: {text}",
                        sortid="table/300/20250217",
                    )

    if forms:
        # Marker entry naming the template, placed before the table's forms.
        data.forms.append(
            Form(form=template_name, tags=["inflection-template"])
        )
        data.forms.extend(forms)
def process_cell_text(
    wxr: WiktextractContext, cell: WikiNode
) -> list[tuple[bool, bool, bool, str]]:
    """Split the text of ``cell`` into spans of uniform formatting.

    The cell is converted to text with ``cell_node_fn``, which marks bold
    and italic runs, and the cleaned text is then split on those markers.

    Returns a list of ``(bold, italics, is_greek, text)`` tuples, one per
    text piece that contains at least one alphabetic character.
    ``is_greek`` tells whether the Unicode name of the first alphabetic
    character of the piece starts with "GREEK".
    """
    raw_text = wxr.wtp.node_to_text(cell, node_handler_fn=cell_node_fn)
    pieces = BOLD_RE.split(clean_value(wxr, raw_text))

    # bold, italics, is greek, text
    spans: list[tuple[bool, bool, bool, str]] = []

    bold = False
    italics = False
    for idx, piece in enumerate(pieces):
        piece = piece.strip()
        if not piece:
            continue

        if idx % 2 == 1:
            # Odd indices hold the markers captured by BOLD_RE.
            if piece == "__B__":
                bold = True
            elif piece == "__/B__":
                bold = False
            elif piece == "__I__":
                italics = True
            elif piece == "__/I__":
                italics = False
            continue

        first_letter = next((ch for ch in piece if ch.isalpha()), None)
        if first_letter is None:
            # no alphabetic characters detected
            continue
        is_greek = unicode_name(first_letter).startswith("GREEK")
        spans.append((bold, italics, is_greek, piece))

    return spans
# Second element of the is_header() result.
UnformattedFound: TypeAlias = bool


def is_header(
    wxr: WiktextractContext,
    cell: WikiNode,
    spans: list[tuple[bool, bool, bool, str]],
    is_greek_entry: bool,
    unformatted_text_found: bool,
    first_cells_are_bold,
) -> tuple[bool, UnformattedFound]:
    """Decide whether ``cell`` should be treated as a table header.

    Container for the more complex logic, because trying to figure out if
    something is a header can get messy. Only the first span of ``spans``
    is looked at; a "bold" or "italic" in the cell's ``style`` attribute
    counts the same as bold or italic markup.

    Returns ``(is_header, unformatted_found)``:

    * A header cell node is always a header.
    * For a non-Greek entry the cell is a header exactly when it starts
      with Greek text.
    * For a Greek entry, italic text is a header and text that is not bold
      is data (reported as unformatted text). Bold text is a header if
      unformatted text was seen earlier or the first cells of the table are
      bold; otherwise a warning is logged and the cell is treated as data.
    """
    if cell.kind == NodeKind.TABLE_HEADER_CELL:
        return True, False

    bold, italic, greek, text = spans[0]

    style = cell.attrs.get("style", "")
    if "bold" in style:
        bold = True
    if "italic" in style:
        italic = True

    if not is_greek_entry:
        # If the table is for another language other than Greek, a cell
        # starting with Greek text is a table header.
        # NOTE(review): the second value is true for *formatted* text here,
        # unlike in the Greek branch below -- confirm that this is intended.
        return bool(greek), (bold or italic)

    # From here on the entry is a Greek one.
    if italic:
        return True, False
    if not bold:
        return False, True

    # Bolded text: a header if we've seen unformatted text before, or if
    # the first cells of the table are bold.
    if unformatted_text_found or first_cells_are_bold:
        return True, False

    wxr.wtp.warning(
        f"Can't be sure if bolded text entry '{text}' is a header or not",
        sortid="table/20250210a",
    )
    return False, False