Coverage for src/wiktextract/extractor/el/table.py: 83% (291 statements)

import re
from typing import TypeAlias
from unicodedata import name as unicode_name

from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode

from wiktextract.clean import clean_value
from wiktextract.extractor.el.tags import translate_raw_tags
from wiktextract.wxr_context import WiktextractContext

from .models import Form, FormSource, WordEntry
from .parse_utils import GREEK_LANGCODES, remove_duplicate_forms

# from .simple_tags import simple_tag_map
# from .tags_utils import convert_tags

# Shorthand for this file. Could be an import, but it's so simple...
Node = str | WikiNode


# GREEK TABLE HEURISTICS:
# If the table is for a Greek-language entry and a cell is a header cell or
# is in italics, treat it as a header.
# If the entry is NOT for Greek and a cell contains Greek text, it's a header.


# node_fns are different from template_fns. template_fns are functions that
# are used to handle how to expand (and otherwise process) templates, while
# node functions are used when turning any parsed "abstract" nodes into strings.
def cell_node_fn(
    node: WikiNode,
) -> list[Node] | None:
    """Handle nodes in the parse tree specially."""
    assert isinstance(node, WikiNode)
    if node.kind == NodeKind.ITALIC:
        return ["__I__", *node.children, "__/I__"]
    if node.kind == NodeKind.BOLD:
        return ["__B__", *node.children, "__/B__"]
    # In case someone puts tables inside tables...
    kind = node.kind
    if kind in {
        NodeKind.TABLE_CELL,
        NodeKind.TABLE_HEADER_CELL,
    }:
        return node.children
    return None
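
# Illustrative (assumed wikitext, not a real test case): a cell containing
# ''λόγος'' run through wxr.wtp.node_to_text(cell, node_handler_fn=cell_node_fn)
# comes out as something like "__I__λόγος__/I__"; BOLD_RE below splits such
# strings back into marker and text spans in process_cell_text().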

BOLD_RE = re.compile(r"(__/?[BI]__)")
TRAILING_NUMBER_RE = re.compile(r"\d+$")

ARTICLES: set[str] = {
    "ο",
    "η",
    "το",
    "την",
    "της",
    "τον",
    "τη",
    "το",
    "οι",
    "οι",
    "τα",
    "των",
    "τους",
    "του",
    "τις",
    "τα",
}
EXTENDED_ARTICLES = ARTICLES | {
    "ο/η",
    "του/της",
    "τον/τη",
    "τον/την",
    "τους/τις",
}
"""Articles to trim from inflection tables / headwords."""
UNEXPECTED_ARTICLES = {
    "αι",
    "ένα",
    "ένας",
    "στα",
    "στη",
    "στην",
    "στης",
    "στις",
    "στο",
    "στον",
    "στου",
    "στους",
    "στων",
    "τ'",
    "ταις",
    "τας",
    "τες",
    "τη",
    "τοις",
    "τω",
}
"""Includes contractions, Ancient Greek articles etc."""


def process_inflection_section(
    wxr: WiktextractContext,
    data: WordEntry,
    snode: WikiNode,
    *,
    source: FormSource = "",
) -> None:
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in table_node_handler_fn
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not for templates inside templates.
    template_depth = 0
    top_template_name: str | None = None

    def table_node_handler_fn(
        node: WikiNode,
    ) -> list[str] | str | None:
        """Expand top-level templates, keep track of which template we are
        currently inside, and collect any tables found in the expanded
        wikitext so they can be parsed separately below."""
        assert isinstance(node, WikiNode)
        nonlocal template_depth
        nonlocal top_template_name

        if isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # templates are handled with table_node_handler_fn.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=table_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret

        if node.kind in {
            NodeKind.TABLE,
        }:
            # XXX Handle tables here
            # template depth and top-level template name
            nonlocal table_nodes
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    _ = wxr.wtp.node_to_html(snode, node_handler_fn=table_node_handler_fn)

    for template_name, table_node in table_nodes:
        # XXX template_name
        parse_table(
            wxr,
            table_node,
            data,
            data.lang_code in GREEK_LANGCODES,
            template_name=template_name or "",
            source=source,
        )

    data.forms = remove_duplicate_forms(wxr, data.forms)


def parse_table(
    wxr: WiktextractContext,
    tnode: WikiNode,
    data: WordEntry,
    is_greek_entry: bool = False,  # Whether the entry is for a Greek word
    template_name: str = "",
    *,
    source: FormSource = "",
) -> None:
    """Parse an inflection table. Generates 'form' data; e.g. 'foos' is a
    form of 'foo' with the tags ['plural']."""
    assert (isinstance(tnode, WikiNode) and tnode.kind == NodeKind.TABLE) or (
        isinstance(tnode, HTMLNode) and tnode.tag == "table"
    )

    is_html_table = isinstance(tnode, HTMLNode)

    # Some debugging code: if wiktwords is passed an --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or just for
    # collecting tables that can be used as test data.
    if wxr.config.expand_tables:
        with open(wxr.config.expand_tables, "w") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tnode)
            f.write(f"{text}\n")

    Row: TypeAlias = int
    Column: TypeAlias = int

    # We build the table as nested dicts (instead of arrays, for convenience):
    # when we come across a cell node, we push a reference to that node into
    # each coordinate point of the table grid that the cell occupies. Each
    # grid point can then be checked to see whether it has already been
    # handled and skipped if needed.
    table_grid: dict[Row, dict[Column, WikiNode]] = {}
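    # Illustrative shape (assumed input, not real data): a header cell with
    # rowspan=2 in the first column is pushed to both of its rows, so
    # table_grid[0][0] and table_grid[1][0] refer to the same node; duplicate
    # visits are later skipped via the `processed` set.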

    first_column_is_headers = True

    for r, row in enumerate(
        tnode.find_html_recursively("tr")
        if is_html_table
        else tnode.find_child_recursively(NodeKind.TABLE_ROW)
    ):
        c = 0
        # print(f"{r=}, {row=}")
        if r not in table_grid:
            table_grid[r] = {}

        for cell in (
            row.find_html(["th", "td"])
            if is_html_table
            else row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL,
            )
        ):
            while c in table_grid[r]:
                c += 1

            try:
                rowspan = int(cell.attrs.get("rowspan", "1"))  # 🡙
                colspan = int(cell.attrs.get("colspan", "1"))  # 🡘
            except ValueError:
                rowspan = 1
                colspan = 1
            # print("COL:", col)

            if colspan > 30:
                wxr.wtp.error(
                    f"Colspan {colspan} over 30, set to 1",
                    sortid="table/128/20250207",
                )
                colspan = 1
            if rowspan > 30:
                wxr.wtp.error(
                    f"Rowspan {rowspan} over 30, set to 1",
                    sortid="table/134/20250207b",
                )
                rowspan = 1

            for rr in range(r, r + rowspan):
                if rr not in table_grid:
                    table_grid[rr] = {}
                for cc in range(c, c + colspan):
                    table_grid[rr][cc] = cell

    if not table_grid[len(table_grid) - 1]:
        # Last row is empty; traverse backwards to skip empty rows at the end
        last_item = None
        for i, rowd in reversed(table_grid.items()):
            if rowd:
                last_item = i
                break

        assert last_item is not None

        new_table_grid = dict()
        for i, rowd in table_grid.items():
            if i > last_item:
                continue
            new_table_grid[i] = rowd
        table_grid = new_table_grid

    if len(table_grid[0]) == 1:
        # Table is only one column wide, so no headers on rows
        first_column_is_headers = False

    if len(table_grid) == 2:
        # The table has only two rows
        first_column_is_headers = False

    # Headers are saved in two dicts whose keys are tuples of their
    # "bookends": e.g. {(1, 1): "foo"} in the column headers dict for a header
    # made up of only the first cell of a row.
    # If we come across a header that has those exact same bookends, only
    # then do we replace the previous tags with it; if the 'widths' overlap,
    # we leave them alone so that we inherit different 'levels' of headers.
    Spread = tuple[int, int]
    SpreadDict = dict[Spread, str]
    # The column and row headers are saved into big dicts: column_hdrs_all is
    # keyed by the column (and row_hdrs_all by row, then column) we are in.
    # The value for each grid position is a dict mapping bookends (`Spread`)
    # to the tag text associated with those bookends.
    column_hdrs_all: dict[Column, SpreadDict] = {}
    row_hdrs_all: dict[Row, dict[Column, SpreadDict]] = {}
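    # Illustrative contents (assumed table, not real data): a header "Ενικός"
    # spanning columns 1-2 of the top row would be stored roughly as
    #   column_hdrs_all == {1: {(1, 3): "Ενικός"}, 2: {(1, 3): "Ενικός"}}
    # and a later header overwrites an entry only when it has the same Spread.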

    forms: list[Form] = []
    processed: set[WikiNode] = set()
    # Some tables have cells with stuff like `του` that we want to add to the
    # next cell
    prefix: str | None = None

    # print(f"{table_grid=}")

    first_cells_are_bold = False
    found_unformatted_text = False

    for r, row_d in table_grid.items():
        # Check for previously added row headers that may have spread lower;
        # remove old row headers that don't exist on this row.
        for c, cell in row_d.items():
            if cell in processed:
                continue
            processed.add(cell)

            try:
                rowspan = int(cell.attrs.get("rowspan", "1"))  # 🡙
                colspan = int(cell.attrs.get("colspan", "1"))  # 🡘
            except ValueError:
                rowspan = 1
                colspan = 1

            spans = process_cell_text(wxr, cell)

            if len(spans) <= 0:
                continue

            if r == 0:
                if spans[0][0]:  # starts_bold
                    first_cells_are_bold = True

            text = clean_value(wxr, " ".join(span[3] for span in spans))
            # print(f"{text=}")

            this_is_header, unformatted_text = is_header(
                wxr,
                cell,
                spans,
                is_greek_entry,
                found_unformatted_text,
                first_cells_are_bold,
            )

            if unformatted_text is True:
                found_unformatted_text = True

            if this_is_header or (c == 0 and first_column_is_headers is True):
                # Because Greek Wiktionary's own language is written in a
                # distinct script, we can rely on that in heuristics. It also
                # seems that for tables in Greek-language entries, even if the
                # table doesn't use proper header cells, you can trust bolding
                # and italics.

                # Currently we don't care which "direction" the header points:
                # we add the tag to both column headers and row headers, and
                # rely on headers appearing only in rows or columns that
                # don't have data cells; i.e. headers and data aren't mixed.

                # Each row and each column gets its own header data.
                # The Spread key is used to keep track of which headers should
                # "overlap": if the spread is different, that should always
                # mean that one is contained within another and thus they're
                # not complementary headers, but one "bigger" category and
                # one "specific" category. If the Spread is identical, then
                # that's obviously two complementary headers, and the later
                # one overwrites the other.
                for rr in range(r, r + rowspan):
                    if rr not in row_hdrs_all:
                        row_hdrs_all[rr] = {c: {(r, r + rowspan): text}}
                    elif c not in row_hdrs_all[rr]:
                        row_hdrs_all[rr][c] = {(r, r + rowspan): text}
                    else:
                        # Also overwrites headers with the same "span"; a
                        # simple way to have overlapping sections.
                        row_hdrs_all[rr][c][(r, r + rowspan)] = text

                for cc in range(c, c + colspan):
                    if cc not in column_hdrs_all:
                        column_hdrs_all[cc] = {(c, c + colspan): text}
                    else:
                        column_hdrs_all[cc][(c, c + colspan)] = text

                prefix = None

            elif text in ARTICLES:
                prefix = text
            else:
                # cell is data
                if text in UNEXPECTED_ARTICLES:
                    wxr.wtp.debug(
                        f"Found '{text}' in table '{wxr.wtp.title}'",
                        sortid="table/335",
                    )
                tags: set[str] = set()
                for cc, vd in row_hdrs_all.get(r, {}).items():
                    if c <= cc:
                        continue
                    for (start, end), tag in vd.items():
                        if start > r or end < r + rowspan:
                            continue
                        tags.add(tag)
                for (start, end), tag in column_hdrs_all.get(c, {}).items():
                    if start > c or end < c + colspan:
                        continue
                    tags.add(tag)
                texts = [text]
                if "&" in text:
                    texts = [t.strip() for t in text.split("&")]
                # Avert your eyes... Python list comprehension syntax amirite
                texts = [line for text in texts for line in text.splitlines()]
                if prefix is not None:
                    texts = [f"{prefix} {t}" for t in texts]
                    prefix = None
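                # Illustrative (assumed cells, not real data): a data cell
                # with the spellings "γιος" and "υιός" on separate lines,
                # preceded by a bare article cell "ο", yields
                # texts == ["ο γιος", "ο υιός"] at this point.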
                if len(tags) > 0:
                    # If a cell has no tags in a table, it's probably a note
                    # or something.
                    forms.extend(
                        Form(
                            form=text,
                            raw_tags=sorted(tags),
                            source=source,
                        )
                        for text in texts
                    )
                else:
                    wxr.wtp.warning(
                        f"Cell without any tags in table: {text}",
                        sortid="table/300/20250217",
                    )

    # logger.debug(
    #     f"{wxr.wtp.title}\n{print_tree(tree, indent=2, ret_value=True)}"
    # )
    # print(forms)

    # # Replace raw_tags with tags if appropriate
    # for form in forms:
    #     legit_tags, new_raw_tags, poses = convert_tags(form.raw_tags)
    #     # Poses are strings like "adj 1", used in pronunciation data
    #     # to later associate sound data with the correct pos entry.
    #     # Ignored here.
    #     if legit_tags:
    #         form.tags = legit_tags
    #         form.tags.extend(poses)
    #         form.raw_tags = new_raw_tags
    # print(f"Inside parse_table: {forms=}")

    if len(forms) > 0:
        data.forms.append(
            Form(
                form=template_name,
                tags=["inflection-template"],
                source=source,
            )
        )

        new_forms = postprocess_table_forms(forms, data.word)
        data.forms.extend(new_forms)
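
    # Illustrative result (hypothetical template name, not real data): a table
    # produced by a template such as "el-table-X" first adds
    #   Form(form="el-table-X", tags=["inflection-template"])
    # to data.forms, followed by the inflected forms whose raw_tags come from
    # the row/column headers collected above.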


def remove_article_forms(forms: list[Form], word: str) -> list[Form]:
    """Return a new form list without article forms.

    Articles can appear in two ways:
    * As a separate form:
      Ex. https://el.wiktionary.org/wiki/λίθος
    * As part of a form, inside form.form
      Ex. most tables

    Used for both headword and table forms. Note that for headword forms,
    where there is usually no grammatical information, we could also use
    these articles to populate tags - but since most of the time we remove
    articles in tables, it was deemed not worth the effort.
    """
    # Do not remove article forms for the article pages themselves...
    if word in ARTICLES:
        return forms

    new_forms: list[Form] = []
    for form in forms:
        if form.form in EXTENDED_ARTICLES:
            continue
        parts = form.form.split()
        if len(parts) > 1 and parts[0] in EXTENDED_ARTICLES:
            form.form = " ".join(parts[1:])
            if not form.form:
                continue
        new_forms.append(form)
    return new_forms
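
# Illustrative behaviour of remove_article_forms (assumed forms, not real
# data): with word="λίθος", the input [Form(form="ο"), Form(form="ο λίθος")]
# loses the bare article form and has the leading article stripped from the
# second, leaving roughly [Form(form="λίθος")].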


def postprocess_table_forms(forms: list[Form], word: str) -> list[Form]:
    """Postprocess table forms.

    * Translate tags
    * Remove articles (requires original word)
    * Convert some parens to rare tag
    * Remove trailing numbers and stars (usu. notes)
    * Form expansion

    About form expansion, there are two types:
    * Separators: "/", "-"
    * Strings inside parens

    The purpose being to go:
    FROM "θα ζητάν(ε) - ζητούν(ε)"
    TO ["θα ζητάν", "θα ζητάνε", "θα ζητούν", "θα ζητούνε"]

    References:
    * https://el.wiktionary.org/wiki/τρώω
    * https://el.wiktionary.org/wiki/ζητάω < this page is cursed anyway
      https://el.wiktionary.org/wiki/αγαπάω < use this instead
    """
    for form in forms:
        translate_raw_tags(form)

    clean_forms = remove_article_forms(forms, word)

    for form in clean_forms:
        # Parens > rare inflection (cf. μπόι)
        if form.form[0] == "(" and form.form[-1] == ")":
            form.form = form.form[1:-1]
            form.tags.append("rare")

        # Remove trailing numbers (usu. notes)
        # https://el.wiktionary.org/wiki/Καπιτόπουλος
        form.form = TRAILING_NUMBER_RE.sub("", form.form)
        # https://el.wiktionary.org/wiki/επιζών
        form.form = form.form.rstrip("*")
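        # e.g. (illustrative, based on the pages referenced above):
        # "Καπιτόπουλος1" -> "Καπιτόπουλος" and "επιζών*" -> "επιζών"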

    # Separators
    separators = ("/", "-")
    verb_particles = ("θα", "να")
    separated_forms: list[Form] = []
    for form in clean_forms:
        # Assume only one type of separator present atm
        sep = next((sep for sep in separators if sep in form.form), None)
        if sep is None:
            separated_forms.append(form)
            continue

        # Ignore suffixes/prefixes (-ισμός)
        if form.form.startswith(sep) or form.form.endswith(sep):
            separated_forms.append(form)
            continue

        # Extract the particle, if any
        suffix_particle = ""
        parts = form.form.split()
        if len(parts) > 1 and parts[0] in verb_particles:
            suffix_particle = parts[0]
            form.form = " ".join(parts[1:])

        for separated in form.form.split(sep):
            separated_form = form.model_copy(deep=True)
            separated = separated.strip()
            if suffix_particle:
                separated_form.form = f"{suffix_particle} {separated}"
            else:
                separated_form.form = separated
            separated_forms.append(separated_form)

    # Strings inside parens
    new_forms: list[Form] = []
    for form in separated_forms:
        text = form.form

        m = re.match(r"^(.*?)\((.*?)\)(.*)$", text)
        if not m:
            new_forms.append(form)
            continue

        before, inside, after = m.groups()
        expanded = [before + after, before + inside + after]
        for variant in expanded:
            new_form = form.model_copy(deep=True)
            new_form.form = variant
            new_forms.append(new_form)
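
    # Illustrative (mirrors the docstring example): a separated form
    # "θα ζητάν(ε)" is expanded into two forms, "θα ζητάν" and "θα ζητάνε".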
    return new_forms


def process_cell_text(
    wxr: WiktextractContext, cell: WikiNode
) -> list[tuple[bool, bool, bool, str]]:
    cell_text = wxr.wtp.node_to_text(cell, node_handler_fn=cell_node_fn)
    cell_text = clean_value(wxr, cell_text)
    split_text = BOLD_RE.split(cell_text)

    # bold, italics, is greek, text
    spans: list[tuple[bool, bool, bool, str]] = []

    inside_bold = False
    inside_italics = False
    for i, text in enumerate(split_text):
        text = text.strip()
        if not text:
            continue
        if i % 2 == 0:
            for ch in text:
                if not ch.isalpha():
                    continue
                greek = unicode_name(ch).startswith("GREEK")
                break
            else:
                # no alphabetic characters detected
                continue

            spans.append((inside_bold, inside_italics, greek, text))
            continue
        match text:
            case "__B__":
                inside_bold = True
            case "__/B__":
                inside_bold = False
            case "__I__":
                inside_italics = True
            case "__/I__":
                inside_italics = False

    return spans
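
# Illustrative output of process_cell_text (assumed cell content, not a real
# test case): a cell rendered as "__I__ plural __/I__ λόγος" yields spans of
# (bold, italics, is_greek, text) roughly like
#   [(False, True, False, "plural"), (False, False, True, "λόγος")]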


UnformattedFound: TypeAlias = bool


def is_header(
    wxr: WiktextractContext,
    cell: WikiNode,
    spans: list[tuple[bool, bool, bool, str]],
    is_greek_entry: bool,
    unformatted_text_found: bool,
    first_cells_are_bold: bool,
) -> tuple[bool, UnformattedFound]:
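    """Decide whether a cell should be treated as a header.

    Illustrative decisions (assumed cells, not from a real table): for a
    Greek-language entry an italicised cell like "γένος" counts as a header,
    while a plain unformatted cell is data and flips the "unformatted text
    seen" flag that later bolded cells are judged against.
    """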
    # Container for more complex logic stuff, because trying to figure out
    # whether something is a header can get messy.
    if cell.kind == NodeKind.TABLE_HEADER_CELL:
        return True, False

    starts_bold, starts_italicized, starts_greek, text = spans[0]

    if "bold" in cell.attrs.get("style", ""):
        starts_bold = True
    if "italic" in cell.attrs.get("style", ""):
        starts_italicized = True

    # Not a Greek entry
    if not is_greek_entry:
        if starts_greek:
            # If the table is for a language other than Greek, a cell
            # starting with Greek text is a table header
            return True, (starts_bold or starts_italicized)
        else:
            return False, (starts_bold or starts_italicized)

    # Is a Greek entry
    if starts_italicized is True:
        return True, False

    if starts_bold is False:
        return False, True

    if unformatted_text_found:
        # This is bolded, but we've seen unformatted text before
        return True, False
    # print(f"{text=}-> {starts_bold=}, {starts_italicized=}, {starts_greek=}")

    if first_cells_are_bold:
        return True, False

    wxr.wtp.wiki_notice(
        f"Can't be sure if bolded text entry '{text}' is a header or not",
        sortid="table/20250210a",
    )
    return False, False