Coverage for src/wiktextract/extractor/it/inflection.py: 96%
134 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1from dataclasses import dataclass
2from itertools import chain
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
def extract_tabs_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Add the forms listed in a ``Tabs`` template to ``word_entry``.

    Positional parameters 1 to 4 are read in order and tagged as masculine
    singular, masculine plural, feminine singular and feminine plural.
    A parameter whose cleaned text is empty or equal to ``wxr.wtp.title``
    produces no form.
    """
    # https://it.wiktionary.org/wiki/Template:Tabs
    positional_tags = (
        ["masculine", "singular"],
        ["masculine", "plural"],
        ["feminine", "singular"],
        ["feminine", "plural"],
    )
    for position, form_tags in enumerate(positional_tags, start=1):
        raw_value = node.template_parameters.get(position, "")
        text = clean_node(wxr, None, raw_value)
        if text == "" or text == wxr.wtp.title:
            continue
        word_entry.forms.append(Form(form=text, tags=form_tags))
def extract_it_decl_agg_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Add forms from an expanded adjective declension table template.

    The template is expanded and every table in the result is scanned row
    by row. Header cells supply raw tags: a header cell with a ``colspan``
    attribute sets the table-wide tag, the only header cell of a row sets
    that row's tag, and any other non-empty header cell is collected as a
    column tag. Each data cell whose text is neither empty nor equal to
    ``wxr.wtp.title`` becomes a ``Form`` carrying the applicable raw tags.
    """
    # https://it.wiktionary.org/wiki/Template:It-decl-agg4
    # https://it.wiktionary.org/wiki/Template:It-decl-agg2
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    any_cell = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in expanded_node.find_child(NodeKind.TABLE):
        spanning_tag = ""
        column_tags = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_tag = ""
            data_col = 0
            # Number of header cells in this row; a lone header cell is
            # treated as the row's own tag.
            header_total = sum(
                1 for _ in row.find_child(NodeKind.TABLE_HEADER_CELL)
            )
            for cell in row.find_child(any_cell):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if cell.attrs.get("colspan", "") != "":
                        spanning_tag = clean_node(wxr, None, cell)
                    elif header_total == 1:
                        row_tag = clean_node(wxr, None, cell)
                    else:
                        header_text = clean_node(wxr, None, cell)
                        if header_text != "":
                            column_tags.append(header_text)
                elif cell.kind == NodeKind.TABLE_CELL:
                    word = clean_node(wxr, None, cell)
                    if word not in ("", wxr.wtp.title):
                        form = Form(form=word)
                        form.raw_tags.extend(
                            tag for tag in (spanning_tag, row_tag) if tag != ""
                        )
                        if data_col < len(column_tags):
                            form.raw_tags.append(column_tags[data_col])
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    # Every data cell occupies a column, even when skipped.
                    data_col += 1
def extract_appendix_conjugation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Collect conjugation forms from the page named ``page_title``.

    The page body is fetched with ``wxr.wtp.get_page_body(page_title, 100)``
    (100 is presumably the appendix namespace id; confirm against the
    wikitextprocessor API). Every direct child template of the parsed page
    whose lower-cased name ends with ``-conj`` is handed to
    ``extract_conj_template``. Nothing happens when no body is returned.
    """
    # https://it.wiktionary.org/wiki/Appendice:Coniugazioni
    body = wxr.wtp.get_page_body(page_title, 100)
    if body is not None:
        conj_templates = (
            template
            for template in wxr.wtp.parse(body).find_child(NodeKind.TEMPLATE)
            if template.template_name.lower().endswith("-conj")
        )
        for template in conj_templates:
            extract_conj_template(wxr, word_entry, template, page_title)
@dataclass
class TableHeader:
    """A header cell recorded while scanning a conjugation table.

    Instances are created by ``extract_conj_template`` and later matched
    against data cells in ``add_conj_form``.
    """

    # Cleaned text of the header cell; used as a raw tag.
    text: str
    # Column position assigned to the cell while scanning its row.
    col_index: int
    # Value of the cell's "colspan" attribute (1 when absent).
    colspan: int
    # Index of the table row that contains the cell.
    row_index: int
    # Value of the cell's "rowspan" attribute (1 when absent).
    rowspan: int
def extract_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """Extract verb forms from an expanded conjugation table template.

    Each table in the expanded template is scanned twice. The first pass
    records every header cell as a ``TableHeader``, classified as a column
    header or a row header. The second pass walks the cells again and turns
    the text of data cells into forms through ``add_conj_form``; cells that
    contain a nested table are delegated to ``extract_conj_cell_table``.

    ``page_title`` is forwarded unchanged to the form-building helpers.
    """
    # https://it.wiktionary.org/wiki/Template:It-conj
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers: list[TableHeader] = []
        row_headers: list[TableHeader] = []
        # First pass: collect the header cells of the table.
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            # Header cells in a row without data cells are column headers,
            # header cells in a row with data cells are row headers.
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # Skip the columns occupied by headers from earlier rows whose
            # rowspan reaches into this row.
            for header in chain(col_headers, row_headers):
                if (
                    row_index > header.row_index
                    and row_index < header.row_index + header.rowspan
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell_node in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell_node)
                colspan = int(cell_node.attrs.get("colspan", "1"))
                rowspan = int(cell_node.attrs.get("rowspan", "1"))
                if cell_node.kind == NodeKind.TABLE_CELL:
                    # Data cells only advance the column position here.
                    pass
                elif not row_has_data:
                    col_headers.append(
                        TableHeader(
                            cell_text, col_index, colspan, row_index, rowspan
                        )
                    )
                else:
                    row_headers.append(
                        TableHeader(
                            cell_text, col_index, colspan, row_index, rowspan
                        )
                    )
                col_index += colspan

        # Second pass: build forms from the data cells.
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            col_index = 0
            # Texts of the headers already counted into col_index below, so
            # the matching header cells of this row are not counted twice.
            added_headers: set[str] = set()
            for header in chain(col_headers, row_headers):
                # Unlike the first pass, ">=" also matches headers that sit
                # in this very row.
                if (
                    row_index >= header.row_index
                    and row_index < header.row_index + header.rowspan
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
                    added_headers.add(header.text)
            for cell_node in row.find_child(
                NodeKind.TABLE_CELL | NodeKind.TABLE_HEADER_CELL
            ):
                cell_has_table = False
                for cell_table in cell_node.find_child_recursively(
                    NodeKind.TABLE
                ):
                    extract_conj_cell_table(
                        wxr,
                        word_entry,
                        cell_table,
                        row_headers,
                        col_headers,
                        page_title,
                        col_index,
                        row_index,
                    )
                    cell_has_table = True
                # NOTE(review): a cell holding a nested table does not
                # advance col_index - confirm this is intended.
                if not cell_has_table:
                    colspan = int(cell_node.attrs.get("colspan", "1"))
                    rowspan = int(cell_node.attrs.get("rowspan", "1"))
                    cell_text = clean_node(wxr, None, cell_node)
                    if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                        # Header cells never yield forms; they only advance
                        # the column when not already counted above.
                        if cell_text not in added_headers:
                            col_index += colspan
                        continue
                    # A data cell may list several forms, separated by line
                    # breaks and commas.
                    for line in cell_text.splitlines():
                        for form_str in line.split(","):
                            form_str = form_str.strip()
                            if form_str not in ["", "—", wxr.wtp.title]:
                                add_conj_form(
                                    word_entry,
                                    form_str,
                                    page_title,
                                    colspan,
                                    rowspan,
                                    col_index,
                                    col_headers,
                                    row_index,
                                    row_headers,
                                )
                    col_index += colspan
def extract_conj_cell_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table_node: WikiNode,
    row_headers: list[TableHeader],
    col_headers: list[TableHeader],
    page_title: str,
    start_col_index: int,
    row_index: int,
):
    """Add forms from a table nested inside a conjugation table cell.

    Every line of every data cell in ``table_node`` becomes a form via
    ``add_conj_form``, except lines that are empty, equal to "—" or equal
    to ``wxr.wtp.title``. The column passed on is ``start_col_index`` plus
    the position of the cell among the data cells of its nested row; the
    row is always the outer ``row_index``.
    """
    skipped = ("", "—", wxr.wtp.title)
    for nested_row in table_node.find_child(NodeKind.TABLE_ROW):
        nested_cells = nested_row.find_child(NodeKind.TABLE_CELL)
        for offset, nested_cell in enumerate(nested_cells):
            span_cols = int(nested_cell.attrs.get("colspan", "1"))
            span_rows = int(nested_cell.attrs.get("rowspan", "1"))
            cell_lines = clean_node(wxr, None, nested_cell).splitlines()
            for candidate in cell_lines:
                if candidate in skipped:
                    continue
                add_conj_form(
                    word_entry,
                    candidate,
                    page_title,
                    span_cols,
                    span_rows,
                    start_col_index + offset,
                    col_headers,
                    row_index,
                    row_headers,
                )
def add_conj_form(
    word_entry: WordEntry,
    form_str: str,
    page_title: str,
    colspan: int,
    rowspan: int,
    col_index: int,
    col_headers: list[TableHeader],
    row_index: int,
    row_headers: list[TableHeader],
) -> None:
    """Create a form for one table cell and tag it with matching headers.

    A ``Form`` is built from ``form_str`` with ``page_title`` as its source.
    Column headers and then row headers are searched from the last recorded
    to the first, and the texts of the selected headers are added to
    ``form.raw_tags`` in their original recording order. The form is then
    passed to ``translate_raw_tags`` and appended to ``word_entry.forms``.

    ``col_index``/``colspan`` and ``row_index``/``rowspan`` describe the
    columns and rows covered by the cell the form came from.
    """
    form = Form(form=form_str, source=page_title)
    use_tags: list[str] = []
    # row_index / col_index of the most recently selected header; -1 means
    # no header has been selected yet.
    last_col_header_row = -1
    last_row_header_col = -1
    for col_header in col_headers[::-1]:
        # A column header is selected when its columns overlap the cell's
        # columns, its text is not used yet, and it either ends at or above
        # the cell's row (first pick) or ends at the row of the previously
        # picked header or the row right after it (later picks).
        if (
            col_header.col_index < col_index + colspan
            and col_index < col_header.col_index + col_header.colspan
            and col_header.text not in form.raw_tags
            and col_header.text not in use_tags
            and (
                (
                    last_col_header_row != -1
                    and col_header.row_index + col_header.rowspan
                    in [last_col_header_row, last_col_header_row + 1]
                )
                or (
                    last_col_header_row == -1
                    and col_header.row_index + col_header.rowspan <= row_index
                )
            )
        ) or (
            # the last "imperativo" column header in Template:It-conj
            col_header.col_index == 0
            and col_header.row_index < row_index + rowspan
            and col_header.row_index + col_header.rowspan > row_index
        ):
            use_tags.append(col_header.text)
            last_col_header_row = col_header.row_index
    # Headers were visited in reverse, so reverse again before storing.
    form.raw_tags.extend(use_tags[::-1])
    use_tags.clear()
    for row_header in row_headers[::-1]:
        # A row header is selected when its rows overlap the cell's rows,
        # its text is not used yet, and it either ends at or left of the
        # cell's column (first pick) or, for later picks, ends at the column
        # of the previously picked header or the column right after it, or
        # starts at that same column.
        if (
            row_header.row_index < row_index + rowspan
            and row_index < row_header.row_index + row_header.rowspan
            and row_header.text not in form.raw_tags
            and row_header.text not in use_tags
            and (
                (
                    last_row_header_col != -1
                    and (
                        row_header.col_index + row_header.colspan
                        in [last_row_header_col, last_row_header_col + 1]
                        or row_header.col_index == last_row_header_col
                    )
                )
                or (
                    last_row_header_col == -1
                    and row_header.col_index + row_header.colspan <= col_index
                )
            )
        ):
            use_tags.append(row_header.text)
            last_row_header_col = row_header.col_index
    form.raw_tags.extend(use_tags[::-1])
    translate_raw_tags(form)
    word_entry.forms.append(form)
302 word_entry.forms.append(form)