Coverage for src / wiktextract / extractor / el / table.py: 83%
290 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
2from typing import TypeAlias
3from unicodedata import name as unicode_name
5from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
7from wiktextract.clean import clean_value
8from wiktextract.extractor.el.tags import translate_raw_tags
9from wiktextract.wxr_context import WiktextractContext
11from .models import Form, FormSource, WordEntry
12from .parse_utils import GREEK_LANGCODES, remove_duplicate_forms
# Shorthand for this file: anything that can appear in a node's child list,
# i.e. plain text or a parsed WikiNode. Could be an import, but it's so
# simple...
Node: TypeAlias = str | WikiNode
18# GREEK TABLE HEURISTICS:
# In a table for a Greek-language entry, a cell that is a table header cell
# or whose text starts in italics is treated as a header.
# In a table for a NON-Greek entry, a cell whose text starts with Greek
# script is treated as a header.
24# node_fns are different from template_fns. template_fns are functions that
25# are used to handle how to expand (and otherwise process) templates, while
26# node functions are used when turning any parsed "abstract" nodes into strings.
27def cell_node_fn(
28 node: WikiNode,
29) -> list[Node] | None:
30 """Handle nodes in the parse tree specially."""
31 assert isinstance(node, WikiNode)
32 if node.kind == NodeKind.ITALIC:
33 return ["__I__", *node.children, "__/I__"]
34 if node.kind == NodeKind.BOLD:
35 return ["__B__", *node.children, "__/B__"]
36 # In case someone puts tables inside tables...
37 kind = node.kind
38 if kind in {
39 NodeKind.TABLE_CELL,
40 NodeKind.TABLE_HEADER_CELL,
41 }:
42 return node.children
43 return None
# Matches the formatting markers inserted by cell_node_fn (`__B__`, `__/B__`,
# `__I__`, `__/I__`). Despite its name it covers italics as well as bold.
# The capturing group makes `BOLD_RE.split()` keep the markers in its result.
BOLD_RE = re.compile(r"(__/?[BI]__)")
# One or more digits at the very end of a string; used to drop trailing
# numbers (usually note markers) from table forms.
TRAILING_NUMBER_RE = re.compile(r"\d+$")
# Article strings recognised in inflection tables. A table cell whose whole
# text is one of these is kept as a prefix for the following data cell (see
# parse_table), and entries whose word is itself one of these keep their
# article forms (see remove_article_forms).
# Each article is listed exactly once; the previous literal repeated "το",
# "οι" and "τα", which had no effect on the resulting set.
ARTICLES: set[str] = {
    "ο",
    "η",
    "το",
    "την",
    "της",
    "τον",
    "τη",
    "οι",
    "τα",
    "των",
    "τους",
    "του",
    "τις",
}
# ARTICLES plus the combined / optional-letter spellings that show up in
# tables and headwords (e.g. "ο/η", "τη(ν)").
EXTENDED_ARTICLES = ARTICLES.union(
    (
        "τη(ν)",
        "ο/η",
        "του/της",
        "τον/τη",
        "τον/τη(ν)",
        "τον/την",
        "τους/τις",
    )
)
"""Articles to trim from inflection tables / headwords."""
# Article-like strings that parse_table only reports (debug message) when one
# of them makes up a whole data cell; they are not used as prefixes.
UNEXPECTED_ARTICLES = {
    "αι", "ένα", "ένας",
    "στα", "στη", "στην", "στης", "στις",
    "στο", "στον", "στου", "στους", "στων",
    "τ'", "ταις", "τας", "τες", "τη", "τοις", "τω",
}
"""Includes contractions, Ancient Greek articles etc."""
def process_inflection_section(
    wxr: WiktextractContext,
    data: WordEntry,
    snode: WikiNode,
    *,
    source: FormSource = "",
    top_template_name: str | None = None,
) -> None:
    """Find every table under ``snode`` and add its forms to ``data.forms``.

    ``snode`` is walked with ``wxr.wtp.node_to_html`` purely for the side
    effects of the node handler below (the produced HTML is discarded).
    Templates are expanded and re-parsed so that tables generated by them are
    seen; each table node is recorded together with the name of the top-level
    template that produced it. Every recorded table is then handed to
    ``parse_table`` and, finally, ``data.forms`` is passed through
    ``remove_duplicate_forms``.

    Args:
        wxr: Extraction context; its ``wtp`` is used to expand/parse wikitext.
        data: Entry that receives the forms; ``data.lang_code`` decides
            whether Greek-entry header heuristics are used.
        snode: Root node of the inflection section.
        source: Value stored in the ``source`` field of generated forms.
        top_template_name: Template name attributed to tables that are not
            generated by a template found under ``snode``.
    """
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # How deep inside a top-level template the handler currently is. Only
    # templates visible in the original wikitext (depth 0) name a table, not
    # templates inside templates.
    template_depth = 0

    def table_node_handler_fn(
        node: WikiNode,
    ) -> list[str] | str | None:
        """Expand templates recursively and collect table nodes.

        Returns the text of an expanded template, ``[""]`` for a table (the
        table is recorded in ``table_nodes`` instead of being rendered) and
        ``None`` for every other node.
        """
        nonlocal template_depth
        nonlocal top_template_name
        assert isinstance(node, WikiNode)

        if isinstance(node, TemplateNode):
            # Recursively expand templates so that nodes inside the
            # templates are handled by this handler too.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            # Remember the name in effect outside this template so it can be
            # put back afterwards. Resetting it to None (as was done before)
            # discarded the caller-supplied default for every table that
            # follows the first template.
            enclosing_name = top_template_name
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=table_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = enclosing_name
            return ret

        if node.kind == NodeKind.TABLE:
            # Record the table with the template name current at this point;
            # the table itself is parsed after the walk has finished.
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    _ = wxr.wtp.node_to_html(snode, node_handler_fn=table_node_handler_fn)

    for template_name, table_node in table_nodes:
        parse_table(
            wxr,
            table_node,
            data,
            data.lang_code in GREEK_LANGCODES,
            template_name=template_name or "",
            source=source,
        )

    data.forms = remove_duplicate_forms(wxr, data.forms)
# Largest rowspan/colspan that is believed; anything bigger is reset to 1.
_MAX_TABLE_SPAN = 30


def _parse_cell_spans(
    wxr: WiktextractContext, cell: WikiNode, *, report: bool
) -> tuple[int, int]:
    """Return the sanitised ``(rowspan, colspan)`` of ``cell``.

    Unparsable attributes make both spans 1, spans above ``_MAX_TABLE_SPAN``
    are reset to 1 (logged through ``wxr.wtp.error`` when ``report`` is true)
    and non-positive spans are treated as 1 so the cell still occupies a
    grid position.
    """
    try:
        rowspan = int(cell.attrs.get("rowspan", "1"))  # 🡙
        colspan = int(cell.attrs.get("colspan", "1"))  # 🡘
    except ValueError:
        return 1, 1

    if colspan > _MAX_TABLE_SPAN:
        if report:
            wxr.wtp.error(
                f"Colspan {colspan} over 30, set to 1",
                sortid="table/128/20250207",
            )
        colspan = 1
    if rowspan > _MAX_TABLE_SPAN:
        if report:
            wxr.wtp.error(
                f"Rowspan {rowspan} over 30, set to 1",
                sortid="table/134/20250207b",
            )
        rowspan = 1
    return max(rowspan, 1), max(colspan, 1)


def _build_table_grid(
    wxr: WiktextractContext, tnode: WikiNode
) -> dict[int, dict[int, WikiNode]]:
    """Lay the cells of ``tnode`` out on a ``{row: {column: cell}}`` grid.

    A cell spanning several rows/columns is stored at every coordinate it
    occupies. Empty rows at the end of the table are dropped; an empty dict
    is returned when the table has no row containing a cell.
    """
    is_html_table = isinstance(tnode, HTMLNode)
    table_grid: dict[int, dict[int, WikiNode]] = {}

    rows = (
        tnode.find_html_recursively("tr")
        if is_html_table
        else tnode.find_child_recursively(NodeKind.TABLE_ROW)
    )
    for r, row in enumerate(rows):
        c = 0
        grid_row = table_grid.setdefault(r, {})
        cells = (
            row.find_html(["th", "td"])
            if is_html_table
            else row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL,
            )
        )
        for cell in cells:
            # Skip positions already taken by cells spanning down from
            # earlier rows (or across from earlier cells of this row).
            while c in grid_row:
                c += 1

            rowspan, colspan = _parse_cell_spans(wxr, cell, report=True)
            for rr in range(r, r + rowspan):
                target_row = table_grid.setdefault(rr, {})
                for cc in range(c, c + colspan):
                    target_row[cc] = cell

    last_filled = max(
        (i for i, rowd in table_grid.items() if rowd), default=None
    )
    if last_filled is None:
        return {}
    return {i: rowd for i, rowd in table_grid.items() if i <= last_filled}


def parse_table(
    wxr: WiktextractContext,
    tnode: WikiNode,
    data: WordEntry,
    is_greek_entry: bool = False,  # Whether the entry is for a Greek word
    template_name: str = "",
    *,
    source: FormSource = "",
) -> None:
    """Parse inflection table. Generates 'form' data; 'foos' is a form of 'foo'
    with the tags ['plural'].

    The table is first laid out on a grid, then every cell is classified as a
    header, an article prefix for the next cell, or a data cell. Data cells
    become ``Form`` objects whose ``raw_tags`` are the texts of the headers
    that cover them. The forms are post-processed with
    ``postprocess_table_forms`` and appended to ``data.forms``; when forms
    were found and ``template_name`` is non-empty, an extra
    ``inflection-template`` form naming the template is appended first.

    A table without any cell produces nothing.

    Args:
        wxr: Extraction context.
        tnode: A wikitext TABLE node or an HTML ``table`` node.
        data: Entry that receives the forms.
        is_greek_entry: Whether the entry is for a Greek word; selects the
            header heuristics used by ``is_header``.
        template_name: Name of the template that generated the table, if any.
        source: Value stored in the ``source`` field of generated forms.
    """
    assert (isinstance(tnode, WikiNode) and tnode.kind == NodeKind.TABLE) or (
        isinstance(tnode, HTMLNode) and tnode.tag == "table"
    )

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    if wxr.config.expand_tables:
        # Explicit encoding: the title and table contain Greek text, which
        # the platform default encoding may not be able to represent.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tnode)
            f.write(f"{text}\n")

    Row: TypeAlias = int
    Column: TypeAlias = int

    # We complete the table using nested dicts (instead of arrays for
    # convenience) such that when we come across a node, we push that node's
    # reference to each coordinate point in the table grid it occupies. Each
    # grid point can then be checked for if it's been handled already and
    # skipped if needed.
    table_grid: dict[Row, dict[Column, WikiNode]] = _build_table_grid(
        wxr, tnode
    )
    if not table_grid:
        # No rows, or no row with a cell: nothing to extract.
        return

    first_column_is_headers = True

    if len(table_grid[0]) == 1:
        # Table is one column in width, no headers on rows
        first_column_is_headers = False

    if len(table_grid) == 2:
        # Table has exactly two rows.
        # NOTE(review): the earlier comment said "one or two rows", but only
        # the two-row case is matched here -- confirm which is intended.
        first_column_is_headers = False

    # Headers are saved in two dict that has their keys made out of tuples
    # made of their "bookends": so {(1,1), "foo"} for a header that is made
    # up of the first cell only of a row in the column_hdrs dict.
    # If we come across a header that has those exact same bookends, only
    # then do we replace the previous tags with it; if you have overlapping
    # 'widths', leave them so that we inherit different 'levels' of headers.
    Spread = tuple[int, int]
    SpreadDict = dict[Spread, str]
    # The column and row headers are saved into big dicts: column_hdrs is a dict
    # whose key is what row or column we are in. The values of that table grid
    # square is a dict with the bookends (`Spread`) and the tags associated with
    # those bookends
    column_hdrs_all: dict[Column, SpreadDict] = {}
    row_hdrs_all: dict[Row, dict[Column, SpreadDict]] = {}

    forms: list[Form] = []
    processed: set[WikiNode] = set()
    # Some tables have cells with stuff like `του` we want to add to the
    # next cell
    prefix: str | None = None

    first_cells_are_bold = False
    found_unformatted_text = False

    for r, row_d in table_grid.items():
        for c, cell in row_d.items():
            # A spanning cell sits at several grid positions; handle it once.
            if cell in processed:
                continue
            processed.add(cell)

            # Same sanitised spans as used for the grid; errors were already
            # reported while building it.
            rowspan, colspan = _parse_cell_spans(wxr, cell, report=False)

            spans = process_cell_text(wxr, cell)
            if not spans:
                continue

            if r == 0 and spans[0][0]:  # starts_bold
                first_cells_are_bold = True

            text = clean_value(wxr, " ".join(span[3] for span in spans))

            this_is_header, unformatted_text = is_header(
                wxr,
                cell,
                spans,
                is_greek_entry,
                found_unformatted_text,
                first_cells_are_bold,
            )

            if unformatted_text is True:
                found_unformatted_text = True

            if this_is_header or (c == 0 and first_column_is_headers is True):
                # Because Greek wiktionary has its own written script to rely
                # in heuristics, we can use that. It also seems that for
                # tables in Greek-language entries even if the table doesn't
                # use proper header cells, you can trust bolding and italics.

                # Currently we don't care which "direction" the header points:
                # we add the tag to both column headers and row headers, and
                # rely on that all headers are on only rows or columns that
                # don't have data cells; ie. headers and data aren't mixed.

                # Each row and each column gets its own header data.
                # The Spread key is used to keep track which headers should
                # "overlap": if the spread is different, that should always
                # mean that one is contained within another and thus they're
                # not complementary headers, but one "bigger" category and
                # one "specific" category. If the Spread is identical, then
                # that's obviously two complementary headers, and the later one
                # overwrites the other.
                row_spread = (r, r + rowspan)
                for rr in range(r, r + rowspan):
                    row_hdrs_all.setdefault(rr, {}).setdefault(c, {})[
                        row_spread
                    ] = text

                column_spread = (c, c + colspan)
                for cc in range(c, c + colspan):
                    column_hdrs_all.setdefault(cc, {})[column_spread] = text

                prefix = None

            elif text in ARTICLES:
                prefix = text
            else:
                # cell is data
                if text in UNEXPECTED_ARTICLES:
                    wxr.wtp.debug(
                        f"Found '{text}' in table '{wxr.wtp.title}'",
                        sortid="table/335",
                    )
                tags: set[str] = set()
                # Row headers: only those in columns to the left of this
                # cell whose spread covers all rows of the cell.
                for cc, vd in row_hdrs_all.get(r, {}).items():
                    if c <= cc:
                        continue
                    for (start, end), tag in vd.items():
                        if start > r or end < r + rowspan:
                            continue
                        tags.add(tag)
                # Column headers whose spread covers all columns of the cell.
                for (start, end), tag in column_hdrs_all.get(c, {}).items():
                    if start > c or end < c + colspan:
                        continue
                    tags.add(tag)

                # One cell may hold several forms, separated by "&" and/or
                # line breaks.
                texts = [text]
                if "&" in text:
                    texts = [t.strip() for t in text.split("&")]
                texts = [
                    line for chunk in texts for line in chunk.splitlines()
                ]
                if prefix is not None:
                    texts = [f"{prefix} {t}" for t in texts]
                    prefix = None
                if tags:
                    forms.extend(
                        Form(
                            form=form_text,
                            raw_tags=sorted(tags),
                            source=source,
                        )
                        for form_text in texts
                    )
                else:
                    # If a cell has no tags in a table, it's probably a note
                    # or something.
                    wxr.wtp.warning(
                        f"Cell without any tags in table: {text}",
                        sortid="table/300/20250217",
                    )

    # If there is no template name (https://el.wiktionary.org/wiki/κρόκος)
    # we are adding junk anyway. This prevents a Form with empty form, which
    # is treated as an (non critical) error by src/wiktextract/wiktionary.py
    #
    # (I think the κρόκος issue is due to not stopping parsing at headings,
    # since the two intermingled templates are in different headings...)
    if forms and template_name:
        data.forms.append(
            Form(
                form=template_name,
                tags=["inflection-template"],
                source=source,
            )
        )

    new_forms = postprocess_table_forms(forms, data.word)
    data.forms.extend(new_forms)
def remove_article_forms(forms: list[Form], word: str) -> list[Form]:
    """Return a new form list without article forms.

    Articles can appear in two ways:
    * As a separate form:
      Ex. https://el.wiktionary.org/wiki/λίθος
    * As part of a form, inside form.form
      Ex. most tables

    A form that is exactly an (extended) article is dropped. A leading
    article followed by at least one more word is cut off; this rewrites
    ``form.form`` on the passed-in ``Form`` objects in place. Forms whose text
    ends up empty are dropped as well.

    Used in both headword and table forms. Note that for headword forms, where
    there is usually no grammatic information, we could also use these articles
    to populate tags - but since most of the time we remove articles in tables,
    it was deemed not worth.
    """
    # Do not remove article forms for the article pages themselves...
    if word in ARTICLES:
        return forms

    kept: list[Form] = []
    for entry in forms:
        if entry.form in EXTENDED_ARTICLES:
            continue
        tokens = entry.form.split()
        if len(tokens) >= 2 and tokens[0] in EXTENDED_ARTICLES:
            entry.form = " ".join(tokens[1:])
        if entry.form:
            kept.append(entry)
    return kept
def postprocess_table_forms(forms: list[Form], word: str) -> list[Form]:
    """Postprocess table forms.

    * Translate tags
    * Remove articles (requires original word)
    * Convert some parens to rare tag
    * Remove trailing numbers and stars (usu. notes)
    * Form expansion

    About form expansion, there are two types:
    * Separators: "/", "-"
    * Strings inside parens

    The purpose being to go:
      FROM "θα ζητάν(ε) - ζητούν(ε)"
      TO   ["θα ζητάν", "θα ζητάνε", "θα ζητούν", "θα ζητούνε"]

    Pieces that end up empty (a cell that is just "-", a dangling separator
    as in "a/", "()" ...) are skipped, so no ``Form`` with an empty ``form``
    is returned.

    The ``Form`` objects in ``forms`` are modified in place (tags translated,
    text cleaned); expanded variants are deep copies.

    References:
    * https://el.wiktionary.org/wiki/τρώω
    * https://el.wiktionary.org/wiki/ζητάω < this page is cursed anyway
      https://el.wiktionary.org/wiki/αγαπάω < use this instead
    """
    for form in forms:
        translate_raw_tags(form)

    clean_forms: list[Form] = []
    for form in remove_article_forms(forms, word):
        text = form.form
        # Parens > rare inflection (cf. μπόι)
        if text.startswith("(") and text.endswith(")"):
            text = text[1:-1]
            form.tags.append("rare")

        # Remove trailing numbers (usu. notes)
        # https://el.wiktionary.org/wiki/Καπιτόπουλος
        text = TRAILING_NUMBER_RE.sub("", text)
        # https://el.wiktionary.org/wiki/επιζών
        # The final strip() removes whitespace exposed by the removals above.
        text = text.rstrip("*").strip()
        if not text:
            continue
        form.form = text
        clean_forms.append(form)

    # Separators
    separators = ("/", "-")
    verb_particles = ("θα", "να")
    separated_forms: list[Form] = []
    for form in clean_forms:
        # Assume only one type of separator present atm
        sep = next((s for s in separators if s in form.form), None)
        # Ignore separator if the original word contained it
        # Ex. "-ισμός", "η-τάξη" etc.
        if sep is None or sep in word:
            separated_forms.append(form)
            continue

        # Extract the leading particle, if any, so that it can be put back
        # in front of every alternative.
        particle = ""
        body = form.form
        parts = body.split()
        if len(parts) > 1 and parts[0] in verb_particles:
            particle = parts[0]
            body = " ".join(parts[1:])

        for piece in body.split(sep):
            piece = piece.strip()
            if not piece:
                # e.g. a lone "-" marking a missing form, or "a/"
                continue
            separated_form = form.model_copy(deep=True)
            separated_form.form = f"{particle} {piece}" if particle else piece
            separated_forms.append(separated_form)

    # Strings inside parens: emit one variant without and one with the
    # parenthesised text (only the first pair of parens is expanded).
    new_forms: list[Form] = []
    for form in separated_forms:
        m = re.match(r"^(.*?)\((.*?)\)(.*)$", form.form)
        if not m:
            new_forms.append(form)
            continue

        before, inside, after = m.groups()
        for variant in (before + after, before + inside + after):
            variant = variant.strip()
            if not variant:
                continue
            new_form = form.model_copy(deep=True)
            new_form.form = variant
            new_forms.append(new_form)

    return new_forms
def process_cell_text(
    wxr: WiktextractContext, cell: WikiNode
) -> list[tuple[bool, bool, bool, str]]:
    """Split the text of ``cell`` into formatting-aware spans.

    The cell is converted to text with ``cell_node_fn`` as node handler (which
    wraps bold/italic content in marker strings), cleaned with
    ``clean_value`` and split on those markers.

    Returns a list of ``(inside_bold, inside_italics, is_greek, text)``
    tuples, one per non-empty text piece containing at least one alphabetic
    character. ``is_greek`` tells whether the Unicode name of the first
    alphabetic character of the piece starts with "GREEK".
    """
    raw_text = wxr.wtp.node_to_text(cell, node_handler_fn=cell_node_fn)
    pieces = BOLD_RE.split(clean_value(wxr, raw_text))

    # bold, italics, is greek, text
    spans: list[tuple[bool, bool, bool, str]] = []

    bold_open = False
    italics_open = False
    # BOLD_RE has one capturing group, so the split result alternates:
    # even indexes are text, odd indexes are the markers themselves.
    for index, piece in enumerate(pieces):
        piece = piece.strip()
        if not piece:
            continue

        if index % 2 == 1:
            if piece == "__B__":
                bold_open = True
            elif piece == "__/B__":
                bold_open = False
            elif piece == "__I__":
                italics_open = True
            elif piece == "__/I__":
                italics_open = False
            continue

        first_letter = next((ch for ch in piece if ch.isalpha()), None)
        if first_letter is None:
            # no alphabetic character in this piece
            continue
        is_greek = unicode_name(first_letter).startswith("GREEK")
        spans.append((bold_open, italics_open, is_greek, piece))

    return spans
# Second element of is_header()'s result: whether the inspected cell counts
# as "unformatted text" for the purposes of later header decisions.
UnformattedFound: TypeAlias = bool


def is_header(
    wxr: WiktextractContext,
    cell: WikiNode,
    spans: list[tuple[bool, bool, bool, str]],
    is_greek_entry: bool,
    unformatted_text_found: bool,
    first_cells_are_bold: bool,
) -> tuple[bool, UnformattedFound]:
    """Decide whether ``cell`` is a header cell.

    Container for more complex logic stuff because trying to figure out
    if something is a header can get messy.

    Args:
        wxr: Extraction context, used to emit a notice for undecidable cells.
        cell: The table cell (wikitext cell or HTML ``th``/``td`` node).
        spans: Non-empty result of ``process_cell_text`` for ``cell``; only
            the first span is inspected.
        is_greek_entry: Whether the table belongs to a Greek-language entry.
        unformatted_text_found: Whether an earlier cell of the table was
            reported as unformatted.
        first_cells_are_bold: Whether a cell of the first row started bold.

    Returns:
        ``(is_header, unformatted_found)``.
    """
    # Real header cells: wikitext `!` cells and, for HTML tables (which
    # parse_table also accepts and searches for "th"/"td"), `<th>` elements.
    if cell.kind == NodeKind.TABLE_HEADER_CELL or (
        isinstance(cell, HTMLNode) and cell.tag == "th"
    ):
        return True, False

    starts_bold, starts_italicized, starts_greek, text = spans[0]

    # Formatting may also come from the cell's style attribute.
    style = cell.attrs.get("style", "")
    if "bold" in style:
        starts_bold = True
    if "italic" in style:
        starts_italicized = True

    # Not a Greek entry
    if not is_greek_entry:
        # If the table is for another language other than Greek, a cell
        # starting with Greek text is a table header.
        # NOTE(review): the second value here is true for *formatted* cells,
        # the opposite of the Greek branch below -- confirm this is intended.
        return starts_greek, (starts_bold or starts_italicized)

    # Is a Greek entry
    if starts_italicized:
        return True, False

    if not starts_bold:
        # Plain text: a data cell, and remember that we saw one.
        return False, True

    # Bolded from here on. It is a header if we've seen unformatted text
    # before, or if the first row of the table used bold cells.
    if unformatted_text_found or first_cells_are_bold:
        return True, False

    wxr.wtp.wiki_notice(
        f"Can't be sure if bolded text entry '{text}' is a header or not",
        sortid="table/20250210a",
    )
    return False, False