Coverage for src / wiktextract / extractor / simple / table.py: 62%
69 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-02 00:27 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-02 00:27 +0000
1from itertools import chain
3from wikitextprocessor import NodeKind, TemplateNode, WikiNode
5# from wikitextprocessor.parser import print_tree
6from wiktextract.page import clean_node
7from wiktextract.wxr_context import WiktextractContext
8from wiktextract.wxr_logging import logger
10from .models import Form, WordEntry
11from .simple_tags import simple_tag_map
12from .tags_utils import convert_tags
# Shorthand for this file: an item that is either a plain string or a
# WikiNode (used below as the element type of returned child lists).
# Could be an import, but it's so simple...
Node = str | WikiNode
18# node_fns are different from template_fns. template_fns are functions that
19# are used to handle how to expand (and otherwise process) templates, while
20# node functions are used when turning parsed nodes into strings.
def cell_node_fn(
    node: WikiNode,
) -> list[Node] | None:
    """Node handler used when turning table-cell parse nodes into text.

    Returns a replacement list of nodes for the cases handled here, or
    ``None`` when this function has nothing special to do for ``node``:

    * an italic node whose only child is the string 'none' (ignoring
      surrounding whitespace) is replaced with a dash, ``["–"]``;
    * a table cell or table header cell is replaced by its children.
    """
    assert isinstance(node, WikiNode)

    if node.kind == NodeKind.ITALIC:
        # Italicized text 'none', like in `deviate`, becomes "–".
        # XXX 'None' without italics...
        children = node.children
        if len(children) == 1:
            only_child = children[0]
            if isinstance(only_child, str) and only_child.strip() == "none":
                return ["–"]

    # Mirrors the default node_handler function; not really needed here,
    # but kept in case someone puts tables inside tables...
    if node.kind in (NodeKind.TABLE_CELL, NodeKind.TABLE_HEADER_CELL):
        return node.children

    return None
def parse_pos_table(
    wxr: WiktextractContext, tnode: TemplateNode, data: WordEntry
) -> list[Form]:
    """Parse the inflection table generated by a POS-section template.

    Simple English Wiktionary article POS sections start with a template
    that generates a table with the different inflected forms. The template
    is expanded, re-parsed, and every table row is walked cell by cell:

    * an empty cell produces nothing (its column header is initialised to
      "" if that column has no header yet);
    * a two-line cell is read as "header line, form line";
    * a one-line cell found in ``simple_tag_map`` is treated as a header
      (and also as the row header when its index in the row is 0);
    * any other one-line cell becomes a form tagged with the current column
      and row headers, and then becomes the column header itself;
    * cells with more than two lines are logged and ignored.

    Args:
        wxr: Extraction context; supplies the wikitext parser and config.
        tnode: The template node that generates the table.
        data: Word entry passed through to ``clean_node``.

    Returns:
        The collected forms. For every form where ``convert_tags`` yields
        tags, ``tags`` is set and ``raw_tags`` is replaced by the raw tags
        that ``convert_tags`` returned.
    """
    assert isinstance(tnode, TemplateNode)
    # Expand the template into text (and all subtemplates too), then parse.
    tree = wxr.wtp.parse(wxr.wtp.node_to_wikitext(tnode), expand_all=True)

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    if wxr.config.expand_tables:
        # Explicit UTF-8: table text routinely contains non-ASCII characters
        # and the locale's default encoding may not be able to encode them.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            table_wikitext = wxr.wtp.node_to_wikitext(tree)
            f.write(f"{table_wikitext}\n")

    # Check if there are actually any headers, because Simple English Wiktionary
    # doesn't use them in these POS template tables.
    # Headers and non-headers in other editions can be a real headache.
    # Having headers is better than not, but when they're inconsistently
    # applied, it's a headache.
    for header in tree.find_child_recursively(NodeKind.TABLE_HEADER_CELL):
        wxr.wtp.debug(
            f"POS template table has headers! {repr(header)[:255]}",
            sortid="simple/table/45",
        )

    # A typical SEW table has simple 2-line cells, without headers, EXCEPT
    # some have actual table structure like "did". That's why we do things
    # row-by-row.
    column_hdrs: dict[int, str] = {}
    forms: list[Form] = []
    for row in chain(
        # This just combines these two (mostly mutually exclusive) calls
        # into one iterable, with an expectation that we get only WikiNodes
        # or only HTML nodes. If they're mixed up, that's super weird. It's
        # a hack!
        tree.find_child_recursively(NodeKind.TABLE_ROW),
        tree.find_html_recursively("tr"),
    ):
        # The active row header, if any (set while going left to right).
        row_hdr = ""
        for i, cell in chain(
            row.find_child(NodeKind.TABLE_CELL, with_index=True),
            row.find_html("td", with_index=True, attr_name="", attr_value=""),
        ):
            text = clean_node(
                wxr, data, cell, node_handler_fn=cell_node_fn
            ).strip()
            if not text:
                # In case there's an empty cell on the first row.
                if i not in column_hdrs:
                    column_hdrs[i] = ""
                continue

            lines = [s.strip() for s in text.splitlines()]
            if len(lines) != 2:
                # SEW style is a single cell whose first line is the
                # 'header' and second line is the form/data; anything else
                # is worth a debug note.
                logger.debug(
                    f"{wxr.wtp.title}: A cell that's "
                    f"not exactly 2 lines: {repr(text)}"
                )

            if len(lines) == 1:
                # XXX do tag parsing instead of i == 0; Levenshtein.
                if text not in simple_tag_map:
                    # Not a known tag: treat the cell as a form, tagged with
                    # whatever headers are currently active.
                    tags: list[str] = [
                        hdr for hdr in (column_hdrs.get(i), row_hdr) if hdr
                    ]
                    forms.append(Form(form=text, raw_tags=tags))
                elif i == 0:
                    # Found something that looks like a tag at the start of
                    # the row: it is the row header.
                    row_hdr = text
                # Add a single line cell as a column header and trust it
                # will be overridden as appropriate.
                # Only applicable to Simple English wiktionary!
                column_hdrs[i] = text
            elif len(lines) == 2:
                # Default assumption: header on line one, form on line two.
                column_hdrs[i] = lines[0]
                tags = [hdr for hdr in (lines[0], row_hdr) if hdr]
                forms.append(Form(form=lines[1], raw_tags=tags))
            # Cells with more than two lines are ignored.

    # logger.debug(
    #     f"{wxr.wtp.title}\n{print_tree(tree, indent=2, ret_value=True)}"
    # )
    # print(forms)

    # Replace raw_tags with tags if appropriate
    for form in forms:
        legit_tags, new_raw_tags, _poses = convert_tags(form.raw_tags)
        # XXX poses are strings like "adj 1", used in pronunciation data
        # to later associate sound data with the correct pos entry.
        # Not useful or common here?
        # if len(poses) > 0:  # This spams the logs
        #     wxr.wtp.warning(f"convert_tags() returned weird `poses` data for "
        #     f"forms: {poses=}", sortid="simple/table/122")
        # NOTE(review): both assignments are kept under the `if`; confirm
        # against the original indentation.
        if legit_tags:
            form.tags = legit_tags
            form.raw_tags = new_raw_tags

    return forms