Coverage for src/wiktextract/extractor/pl/inflection.py: 94%
237 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-11 10:26 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-11 10:26 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
def extract_inflection_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    lang_code: str,
    level_node: WikiNode,
) -> None:
    """Extract inflected forms from a section and attach them to entries.

    Templates found inside list items are handed to
    `extract_inflection_template` together with the most recent sense
    index seen in the surrounding text (e.g. "(1.1)").  When the section
    holds no list at all, every template below `level_node` is processed.
    The resulting forms are appended to each entry of `page_data` whose
    `lang_code` equals `lang_code`; a form carrying a sense index is only
    added when `match_sense_index` accepts it for that entry.
    """
    # local import, presumably to avoid a circular import with .page
    from .page import match_sense_index

    current_sense = ""
    collected = []
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in item.children:
            if isinstance(child, str):
                # NOTE: ",-." inside the class is a range that covers
                # exactly ",", "-" and "."
                found = re.search(r"\([\d\s,-.]+\)", child)
                if found is not None:
                    current_sense = found.group(0).strip("()")
            elif isinstance(child, TemplateNode):
                collected += extract_inflection_template(
                    wxr, child, current_sense
                )

    if not level_node.contain_node(NodeKind.LIST):
        # have to search recursively because of "preformatted" nodes
        for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
            collected += extract_inflection_template(
                wxr, template, current_sense
            )

    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for form in collected:
            if form.sense_index == "" or match_sense_index(
                form.sense_index, entry
            ):
                entry.forms.append(form)
def extract_inflection_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Dispatch an inflection template to its extractor.

    Returns the forms produced by the matching extractor, or an empty
    list when the template name is not one of the handled ones.
    """
    name = template_node.template_name
    if name in ("odmiana-rzeczownik-polski", "odmiana-rzeczownik-czeski"):
        extractor = extract_odmiana_rzeczownik_polski
    elif name == "odmiana-przymiotnik-polski":
        extractor = extract_odmiana_przymiotnik_polski
    elif name == "odmiana-czasownik-polski":
        extractor = extract_odmiana_czasownik_polski
    elif name == "odmiana-rzeczownik-esperanto":
        extractor = odmiana_rzeczownik_esperanto
    else:
        return []
    return extractor(wxr, template_node, sense_index)
def _append_noun_form(
    wxr: WiktextractContext,
    forms: list[Form],
    form_nodes: list,
    sense_index: str,
    raw_tags: list[str],
    tags: list[str],
) -> None:
    # Build a Form from the collected nodes and append it to `forms`,
    # unless the cleaned text is empty or equals the page title.
    form_text = clean_node(wxr, None, form_nodes)
    if form_text != "" and form_text != wxr.wtp.title:
        form = Form(
            form=form_text,
            sense_index=sense_index,
            raw_tags=raw_tags,
            tags=tags,
        )
        translate_raw_tags(form)
        forms.append(form)


def extract_odmiana_rzeczownik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract noun forms from the arguments of a noun table template.

    https://pl.wiktionary.org/wiki/Szablon:odmiana-rzeczownik-polski

    Each named argument yields one or more forms; alternatives inside one
    argument are separated by "/".  The argument name supplies the raw
    tags, and templates whose text ends with "." add further raw tags to
    the form they accompany.  Forms equal to the page title are skipped.
    """
    forms: list[Form] = []
    for arg_name, arg_value in template_node.template_parameters.items():
        if not isinstance(arg_name, str):
            continue
        if arg_name.startswith("Forma"):
            raw_tags = ["depr."] if arg_name.endswith("depr") else ["ndepr."]
            raw_tags.extend(["M.", "W.", "lm"])
        else:
            raw_tags = arg_name.lower().split()
        if not isinstance(arg_value, list):
            # A value may be a single string or a single node; wrap both
            # so a lone template/link value is no longer silently dropped.
            # NOTE(review): assumes non-list values are str or WikiNode,
            # which node_to_wikitext is expected to accept - confirm.
            arg_value = [arg_value]

        form_nodes = []
        current_form_raw_tags = []
        current_form_tags = []
        parsed_arg = wxr.wtp.parse(wxr.wtp.node_to_wikitext(arg_value))
        for node in parsed_arg.children:
            if isinstance(node, str):
                # Every "/" ends the current form, so a text node holding
                # several slashes yields one form per alternative.
                *finished, remainder = node.split("/")
                for piece in finished:
                    form_nodes.append(piece)
                    _append_noun_form(
                        wxr,
                        forms,
                        form_nodes,
                        sense_index,
                        raw_tags + current_form_raw_tags,
                        list(current_form_tags),
                    )
                    form_nodes.clear()
                    current_form_raw_tags.clear()
                    current_form_tags.clear()
                form_nodes.append(remainder)
            elif isinstance(node, TemplateNode):
                node_text = clean_node(wxr, None, node)
                if node_text.endswith("."):
                    # abbreviation-like template text is a tag, not a form
                    current_form_raw_tags.append(node_text)
                else:
                    form_nodes.append(node_text)
                if node.template_name == "potencjalnie":
                    current_form_tags.extend(["potential", "rare"])
            else:
                form_nodes.append(node)
        if len(form_nodes) > 0:
            _append_noun_form(
                wxr,
                forms,
                form_nodes,
                sense_index,
                raw_tags + current_form_raw_tags,
                list(current_form_tags),
            )
    return forms
def create_noun_form(
    form_text: str,
    sense_idx: str,
    raw_tags: list[str],
) -> Form:
    """Build a `Form` from text, sense index and raw tags.

    `translate_raw_tags` is applied to the new form before it is returned.
    """
    noun_form = Form(
        form=form_text,
        sense_index=sense_idx,
        raw_tags=raw_tags,
    )
    translate_raw_tags(noun_form)
    return noun_form
@dataclass
class TableHeader:
    """Text of a table header cell and the index range it spans.

    The extractors in this module build it as `(text, index, index + span)`
    and test overlap with `start < cell_end and cell_start < end`, i.e.
    the range is treated as half-open: `start` inclusive, `end` exclusive.
    """

    # cleaned header text, later used as a raw tag
    text: str
    # first column/row index covered by the header
    start: int
    # index just past the last covered column/row
    end: int
def extract_odmiana_przymiotnik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract adjective forms from an expanded adjective table template.

    https://pl.wiktionary.org/wiki/Szablon:odmiana-przymiotnik-polski

    The template is expanded and every HTML table found in the result
    (at any depth) is passed to `extract_odmiana_przymiotnik_polski_table`.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    return [
        form
        for table in expanded.find_html_recursively("table")
        for form in extract_odmiana_przymiotnik_polski_table(
            wxr, table, sense_index
        )
    ]
def extract_odmiana_przymiotnik_polski_table(
    wxr: WiktextractContext, table_tag: WikiNode, sense_index: str
) -> list[Form]:
    """Extract forms from one HTML table of the adjective template.

    Header cells containing bold text hold comparative forms: the text
    before each bold node becomes the raw tag of the bold form.  Other
    `th` cells, and rows whose `td` cells all have class "forma", define
    column headers.  In data rows a "forma" cell is the row header and
    every other cell is a form tagged with the row header and all column
    headers overlapping its column range.
    """
    result = []
    headers = []  # column headers, accumulated over all rows
    for row in table_tag.find_html("tr"):
        header_col = 0
        for th in row.find_html("th"):
            if not th.contain_node(NodeKind.BOLD):
                header_text = clean_node(wxr, None, th)
                span = int(th.attrs.get("colspan", "1"))
                if header_text != "przypadek":
                    headers.append(
                        TableHeader(header_text, header_col, header_col + span)
                    )
                    header_col += span
                continue

            # comparative forms in the second and third table header
            label_nodes = []
            for child in th.children:
                is_bold = (
                    isinstance(child, WikiNode) and child.kind == NodeKind.BOLD
                )
                if not is_bold:
                    label_nodes.append(child)
                    continue
                label = clean_node(wxr, None, label_nodes)
                form = Form(
                    form=clean_node(wxr, None, child),
                    sense_index=sense_index,
                )
                if label != "":
                    form.raw_tags.append(label)
                translate_raw_tags(form)
                if form.form not in [
                    "",
                    "nie stopniuje się",
                    wxr.wtp.title,
                ]:
                    result.append(form)

        # td tags
        header_col = 0
        data_col = 0
        row_label = ""
        only_header_cells = all(
            cell.attrs.get("class", "") == "forma"
            for cell in row.find_html("td")
        )
        for td in row.find_html("td"):
            if any(td.find_html("table")):
                # a nested table ends the row; the recursive table search
                # in extract_odmiana_przymiotnik_polski also yields it
                break
            cell_text = clean_node(wxr, None, td)
            if only_header_cells:
                headers.append(
                    TableHeader(cell_text, header_col, header_col + 1)
                )
                header_col += 1
                continue
            if td.attrs.get("class", "") == "forma":
                row_label = cell_text
                continue

            span = int(td.attrs.get("colspan", "1"))
            cell_end = data_col + span
            if cell_text == wxr.wtp.title:
                # the lemma itself is not a form, but still takes columns
                data_col = cell_end
                continue
            form = Form(form=cell_text, sense_index=sense_index)
            if row_label != "":
                form.raw_tags.append(row_label)
            form.raw_tags.extend(
                header.text
                for header in headers
                if header.start < cell_end and data_col < header.end
            )
            data_col = cell_end
            translate_raw_tags(form)
            result.append(form)
    return result
def extract_odmiana_czasownik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract verb forms from an expanded verb table template.

    https://pl.wiktionary.org/wiki/Szablon:odmiana-czasownik-polski

    Every HTML table in the expanded template is processed by
    `extract_odmiana_czasownik_polski_table`; one column header list is
    shared by all of those calls.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    shared_col_headers = []
    result = []
    for table in expanded.find_html_recursively("table"):
        result += extract_odmiana_czasownik_polski_table(
            wxr, table, sense_index, shared_col_headers
        )
    return result
def extract_odmiana_czasownik_polski_table(
    wxr: WiktextractContext,
    table_tag: WikiNode,
    sense_index: str,
    col_headers: list[TableHeader],
) -> list[Form]:
    """Extract forms from one HTML table of the verb template.

    A first pass over the rows collects headers: a `th` in a row without
    `td` cells and without a "rowspan" attribute is a column header
    (appended to the caller-supplied `col_headers`); any other `th` is a
    row header, one per link when the cell contains links.  A second pass
    splits every `td` on newlines and commas and tags each form with the
    column and row headers overlapping the cell.
    """
    result = []
    row_headers = []
    for row_index, tr_tag in enumerate(table_tag.find_html("tr")):
        row_has_td = any(tr_tag.find_html("td"))
        next_col = 0
        for th_tag in tr_tag.find_html("th"):
            header_text = clean_node(wxr, None, th_tag)
            if header_text in ["forma", "pozostałe formy"]:
                continue
            if row_has_td or "rowspan" in th_tag.attrs:
                span = int(th_tag.attrs.get("rowspan", "1"))
                last_row = row_index + span
                if th_tag.contain_node(NodeKind.LINK):
                    row_headers.extend(
                        TableHeader(
                            clean_node(wxr, None, link), row_index, last_row
                        )
                        for link in th_tag.find_child(NodeKind.LINK)
                    )
                else:
                    row_headers.append(
                        TableHeader(header_text, row_index, last_row)
                    )
            else:
                span = int(th_tag.attrs.get("colspan", "1"))
                col_headers.append(
                    TableHeader(header_text, next_col, next_col + span)
                )
                next_col += span

    for row_index, tr_tag in enumerate(table_tag.find_html("tr")):
        col = 0
        for td_tag in tr_tag.find_html("td"):
            if any(td_tag.find_html("table")):
                break
            cell_text = clean_node(wxr, None, td_tag)
            col_end = col + int(td_tag.attrs.get("colspan", "1"))
            row_end = row_index + int(td_tag.attrs.get("rowspan", "1"))
            # "Szablon:potencjalnie" uses "{{int:potential-form-tooltip}}"
            # not implemented magic word
            potential = any(
                True
                for _ in td_tag.find_html(
                    "span", attr_name="class", attr_value="potential-form"
                )
            )
            # headers do not change in this pass, so match them once per cell
            header_texts = [
                header.text
                for header in col_headers
                if header.start < col_end and col < header.end
            ]
            header_texts += [
                header.text
                for header in row_headers
                if header.start < row_end and row_index < header.end
            ]

            for line in cell_text.splitlines():
                for piece in line.split(","):
                    form_text = piece.strip()
                    if form_text == "" or form_text == wxr.wtp.title:
                        continue
                    form = Form(form=form_text, sense_index=sense_index)
                    form.raw_tags.extend(header_texts)
                    translate_raw_tags(form)
                    if potential:
                        form.tags.extend(["potential", "rare"])
                    result.append(form)

            col = col_end

    return result
def odmiana_rzeczownik_esperanto(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract noun forms from an expanded Esperanto noun table template.

    https://pl.wiktionary.org/wiki/Szablon:odmiana-rzeczownik-esperanto

    "short-content" spans with the text "blm"/"blp" add the tags
    "no-plural"/"no-singulative" to every form.  Rows without `td` cells
    supply column headers, the last non-empty `th` of a data row is its
    row header, and each non-empty `td` other than the page title becomes
    a form.  A cell containing a "potential-form" span is tagged
    "potential" and "rare" exactly once, however many such spans it has.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    forms = []
    col_headers = []
    tags = []
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="short-content"
    ):
        span_text = clean_node(wxr, None, span_tag)
        if span_text == "blm":
            tags.append("no-plural")
        elif span_text == "blp":
            tags.append("no-singulative")

    for table_tag in expanded_node.find_html_recursively("table"):
        for tr_tag in table_tag.find_html("tr"):
            is_header_row = not any(tr_tag.find_html("td"))
            row_header = ""
            for th_tag in tr_tag.find_html("th"):
                th_text = clean_node(wxr, None, th_tag)
                if th_text == "":
                    continue
                if is_header_row:
                    col_headers.append(th_text)
                else:
                    row_header = th_text

            for td_index, td_tag in enumerate(tr_tag.find_html("td")):
                form_text = clean_node(wxr, None, td_tag)
                if form_text == "" or form_text == wxr.wtp.title:
                    continue
                # A boolean flag instead of one extend() per span, so
                # several spans in a cell cannot duplicate the tags.
                is_potential_form = any(
                    True
                    for _ in td_tag.find_html_recursively(
                        "span", attr_name="class", attr_value="potential-form"
                    )
                )
                td_tags = ["potential", "rare"] if is_potential_form else []
                form = Form(
                    form=form_text,
                    sense_index=sense_index,
                    tags=tags + td_tags,
                )
                if row_header != "":
                    form.raw_tags.append(row_header)
                if td_index < len(col_headers):
                    form.raw_tags.append(col_headers[td_index])
                translate_raw_tags(form)
                forms.append(form)

    return forms