Coverage for src/wiktextract/extractor/simple/table.py: 62%
69 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from itertools import chain
3from wikitextprocessor import NodeKind, TemplateNode, WikiNode
5# from wikitextprocessor.parser import print_tree
6from wiktextract.page import clean_node
7from wiktextract.wxr_context import WiktextractContext
8from wiktextract.wxr_logging import logger
10from .models import Form, WordEntry
11from .simple_tags import simple_tag_map
12from .tags_utils import convert_tags
14# Shorthand for this file. Could be an import, but it's so simple...
15Node = str | WikiNode
19# node_fns are different from template_fns. template_fns are functions that
20# are used to handle how to expand (and otherwise process) templates, while
21# node functions are used when turning parsed nodes into strings.
def cell_node_fn(
    node: WikiNode,
) -> list[Node] | None:
    """Node handler used when converting table cell contents to text.

    Returns a replacement list of nodes for the kinds handled here, or
    ``None`` when the node is not handled by this function:

    * an italic node whose only child is the string 'none' (ignoring
      surrounding whitespace) is replaced with an en dash;
    * table cells and table header cells are replaced by their children.
    """
    assert isinstance(node, WikiNode)
    kind = node.kind

    if kind == NodeKind.ITALIC:
        # Italicized 'none', like in `deviate`, becomes "–".
        # XXX 'None' without italics...
        children = node.children
        if len(children) == 1:
            only_child = children[0]
            if isinstance(only_child, str) and only_child.strip() == "none":
                return ["–"]

    # Mirrors the default node_handler function; not really needed, but
    # kept in case someone puts tables inside tables...
    if kind in (NodeKind.TABLE_CELL, NodeKind.TABLE_HEADER_CELL):
        return node.children

    return None
def _header_tags(column_hdr: str, row_hdr: str) -> list[str]:
    """Return the non-empty headers as raw tags, column header first."""
    return [hdr for hdr in (column_hdr, row_hdr) if hdr]


def parse_pos_table(
    wxr: WiktextractContext, tnode: TemplateNode, data: WordEntry
) -> list[Form]:
    """Parse the inflection table generated by a POS template.

    Simple English Wiktionary article POS sections start with a template
    that generates a table with the different inflected forms. The template
    is expanded and re-parsed, and every non-empty table cell is turned into
    a `Form`:

    * a two-line cell is read as "header line, form line";
    * a one-line cell found in `simple_tag_map` is recorded as a header
      (and as the row header when it is the first cell of the row);
    * any other one-line cell becomes a form tagged with the currently
      known column and row headers;
    * cells with more than two lines are ignored (only logged).

    Finally the raw tags of each form are run through `convert_tags()`.

    Args:
        wxr: Extraction context; used for parsing, logging and config.
        tnode: The template node that generates the table.
        data: Word entry passed through to `clean_node()`.

    Returns:
        The forms found in the table, in document order.
    """
    assert isinstance(tnode, TemplateNode)
    # Expand the template into text (and all subtemplates too), then parse.
    tree = wxr.wtp.parse(wxr.wtp.node_to_wikitext(tnode), expand_all=True)

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    # NOTE: mode "w" truncates the file, so each call overwrites the output
    # of the previous one.
    if wxr.config.expand_tables:
        # Explicit UTF-8: titles and wikitext routinely contain non-ASCII
        # characters, which the platform default encoding may not handle.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tree)
            f.write(f"{text}\n")

    # Check if there are actually any headers, because Simple English
    # Wiktionary doesn't use them in these POS template tables.
    # Headers and non-headers in other editions can be a real headache.
    # Having headers is better than not, but when they're inconsistently
    # applied, it's a headache.
    for header in tree.find_child_recursively(NodeKind.TABLE_HEADER_CELL):
        wxr.wtp.debug(
            f"POS template table has headers! {repr(header)[:255]}",
            sortid="simple/table/45",
        )

    # A typical SEW table has simple 2-line cells, without headers, EXCEPT
    # some have actual table structure like "did". That's why we do things
    # row-by-row.
    column_hdrs: dict[int, str] = {}
    forms: list[Form] = []
    for row in chain(
        # This just combines these two (mostly mutually exclusive) calls
        # into one iterable, with an expectation that we get only WikiNodes
        # or only HTML nodes. If they're mixed up, that's super weird. It's
        # a hack!
        tree.find_child_recursively(NodeKind.TABLE_ROW),
        tree.find_html_recursively("tr"),
    ):
        # The active row header, if any (cells are read left to right).
        row_hdr = ""
        for i, cell in chain(
            row.find_child(NodeKind.TABLE_CELL, with_index=True),
            row.find_html("td", with_index=True, attr_name="", attr_value=""),
        ):
            text = clean_node(
                wxr, data, cell, node_handler_fn=cell_node_fn
            ).strip()
            if not text:
                # In case there's an empty cell on the first row.
                if i not in column_hdrs:
                    column_hdrs[i] = ""
                continue

            lines = [s.strip() for s in text.splitlines()]
            if len(lines) != 2:
                # SEW style is a single cell where the first line is the
                # 'header' and the second is the form/data; anything else
                # is worth a debug message.
                logger.debug(
                    f"{wxr.wtp.title}: A cell that's "
                    f"not exactly 2 lines: {repr(text)}"
                )

            if len(lines) == 1:
                # XXX do tag parsing instead of i == 0; Levenshtein.
                if text in simple_tag_map:
                    # Found something that looks like a tag.
                    if i == 0:
                        row_hdr = text
                    column_hdrs[i] = text
                else:
                    tags = _header_tags(column_hdrs.get(i, ""), row_hdr)
                    forms.append(Form(form=text, raw_tags=tags))
                    # Add a single line cell as a column header and trust it
                    # will be overridden as appropriate.
                    # Only applicable to Simple English Wiktionary!
                    column_hdrs[i] = text
                continue

            if len(lines) == 2:
                # Default assumption: header on the first line, form on the
                # second.
                column_hdrs[i] = lines[0]
                tags = _header_tags(column_hdrs[i], row_hdr)
                forms.append(Form(form=lines[1], raw_tags=tags))
            # Ignore cells with more than two lines.

    # logger.debug(
    #     f"{wxr.wtp.title}\n{print_tree(tree, indent=2, ret_value=True)}"
    # )
    # print(forms)

    # Replace raw_tags with tags if appropriate.
    for form in forms:
        # XXX the third return value (`poses`) holds strings like "adj 1",
        # used in pronunciation data to later associate sound data with the
        # correct pos entry. Not useful or common here? Warning about it
        # spams the logs, so it is ignored.
        legit_tags, new_raw_tags, _poses = convert_tags(form.raw_tags)
        if legit_tags:
            form.tags = legit_tags
            form.raw_tags = new_raw_tags

    return forms