Coverage for src/wiktextract/extractor/pl/inflection.py: 95%
233 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
def extract_inflection_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    lang_code: str,
    level_node: WikiNode,
) -> None:
    """Gather forms from an inflection section and attach them to entries.

    Walks the list items below ``level_node``: a parenthesised marker found
    in a text child becomes the current sense index, and every template
    child is handed to ``extract_inflection_template`` together with that
    index. If the section holds no list at all, every template found
    anywhere below ``level_node`` is processed instead.

    The collected forms are appended to each entry of ``page_data`` whose
    ``lang_code`` equals ``lang_code``, provided the form has no sense index
    or ``match_sense_index`` accepts it for that entry.
    """
    from .page import match_sense_index

    sense_index = ""
    collected = []
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in item.children:
            if isinstance(child, TemplateNode):
                collected.extend(
                    extract_inflection_template(wxr, child, sense_index)
                )
                continue
            if not isinstance(child, str):
                continue
            # The ",-." inside the class is a character range that covers
            # exactly ",", "-" and ".".
            marker = re.search(r"\([\d\s,-.]+\)", child)
            if marker is not None:
                sense_index = marker.group(0).strip("()")

    if not level_node.contain_node(NodeKind.LIST):
        # have to search recursively because of "preformatted" nodes
        for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
            collected.extend(
                extract_inflection_template(wxr, template, sense_index)
            )

    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for form in collected:
            if form.sense_index == "" or match_sense_index(
                form.sense_index, entry
            ):
                entry.forms.append(form)
def extract_inflection_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Route an inflection template to the extractor for its table type.

    Returns the forms produced by the matching extractor, or an empty list
    when the template name is not one of the handled names.
    """
    name = template_node.template_name
    if name in ("odmiana-rzeczownik-polski", "odmiana-rzeczownik-czeski"):
        return extract_odmiana_rzeczownik_polski(
            wxr, template_node, sense_index
        )
    if name == "odmiana-przymiotnik-polski":
        return extract_odmiana_przymiotnik_polski(
            wxr, template_node, sense_index
        )
    if name == "odmiana-czasownik-polski":
        return extract_odmiana_czasownik_polski(wxr, template_node, sense_index)
    if name == "odmiana-rzeczownik-esperanto":
        return odmiana_rzeczownik_esperanto(wxr, template_node, sense_index)
    return []
def extract_odmiana_rzeczownik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract noun forms from the arguments of a noun declension template.

    Template documentation:
    https://pl.wiktionary.org/wiki/Szablon:odmiana-rzeczownik-polski

    Every string-named argument yields zero or more forms. Arguments whose
    name starts with "Forma" get the raw tags "depr."/"ndepr." plus "M.",
    "W." and "lm"; for any other argument the lower-cased, whitespace-split
    argument name is used as raw tags. Inside an argument value, each "/" in
    plain text ends the current form and starts the next one. Templates
    whose cleaned text ends with "." are treated as extra raw tags of the
    current form, other templates contribute their text to the form, and
    the "potencjalnie" template adds the tags "potential" and "rare".

    Forms whose text is empty or equal to the page title are skipped.
    """
    forms = []

    def emit(nodes: list, raw_tags: list, tags: list) -> None:
        # Build a single Form from the nodes gathered so far.
        form_text = clean_node(wxr, None, nodes)
        if form_text == "" or form_text == wxr.wtp.title:
            return
        form = Form(
            form=form_text,
            sense_index=sense_index,
            raw_tags=raw_tags,
            # Copy: the caller clears and reuses its tag list afterwards.
            tags=list(tags),
        )
        translate_raw_tags(form)
        forms.append(form)

    for arg_name, arg_value in template_node.template_parameters.items():
        if not isinstance(arg_name, str):
            continue
        if arg_name.startswith("Forma"):
            raw_tags = ["depr."] if arg_name.endswith("depr") else ["ndepr."]
            raw_tags.extend(["M.", "W.", "lm"])
        else:
            raw_tags = arg_name.lower().split()
        if isinstance(arg_value, str):
            arg_value = [arg_value]
        if not isinstance(arg_value, list):
            continue

        form_nodes = []
        current_form_raw_tags = []
        current_form_tags = []
        parsed_arg = wxr.wtp.parse(wxr.wtp.node_to_wikitext(arg_value))
        for node in parsed_arg.children:
            if isinstance(node, str) and "/" in node:
                # Every slash closes a form. Splitting on all of them keeps
                # "a/b/c" from producing a merged "b/c" form.
                *finished, remainder = node.split("/")
                for piece in finished:
                    form_nodes.append(piece)
                    emit(
                        form_nodes,
                        raw_tags + current_form_raw_tags,
                        current_form_tags,
                    )
                    form_nodes.clear()
                    current_form_raw_tags.clear()
                    current_form_tags.clear()
                form_nodes.append(remainder)
            elif isinstance(node, TemplateNode):
                node_text = clean_node(wxr, None, node)
                if node_text.endswith("."):
                    current_form_raw_tags.append(node_text)
                else:
                    form_nodes.append(node_text)
                if node.template_name == "potencjalnie":
                    current_form_tags.extend(["potential", "rare"])
            else:
                form_nodes.append(node)
        if form_nodes:
            emit(
                form_nodes,
                raw_tags + current_form_raw_tags,
                current_form_tags,
            )
    return forms
def create_noun_form(
    form_text: str,
    sense_idx: str,
    raw_tags: list[str],
) -> Form:
    """Build a ``Form`` from text, sense index and raw tags.

    The new form is passed through ``translate_raw_tags`` before it is
    returned.
    """
    noun_form = Form(
        form=form_text,
        sense_index=sense_idx,
        raw_tags=raw_tags,
    )
    translate_raw_tags(noun_form)
    return noun_form
@dataclass
class TableHeader:
    """A table header label together with the index range it covers.

    The extractors in this module create instances as
    ``(text, index, index + span)`` and test them with
    ``start < cell_end and cell_start < end``, so ``start`` is inclusive and
    ``end`` is exclusive.
    """

    # Cleaned header text, used as a raw tag for matching cells.
    text: str
    # First covered column (or row) index.
    start: int
    # Index just past the last covered column (or row).
    end: int
def extract_odmiana_przymiotnik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract forms from an adjective declension template.

    Template documentation:
    https://pl.wiktionary.org/wiki/Szablon:odmiana-przymiotnik-polski

    The template is expanded, and every HTML table found in the result is
    passed to ``extract_odmiana_przymiotnik_polski_table``.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    return [
        form
        for table in expanded_root.find_html_recursively("table")
        for form in extract_odmiana_przymiotnik_polski_table(
            wxr, table, sense_index
        )
    ]
def extract_odmiana_przymiotnik_polski_table(
    wxr: WiktextractContext, table_tag: WikiNode, sense_index: str
) -> list[Form]:
    """Extract forms from one expanded adjective declension table.

    Column headers come from ``<th>`` cells without bold text and from rows
    whose ``<td>`` cells all have the class "forma". A ``<th>`` containing
    bold text yields one form per bold child, tagged with the text that
    precedes it in that cell. In other rows a "forma" cell sets the row
    header and each remaining cell becomes a form tagged with the row
    header and every column header whose range overlaps the cell. Cells
    equal to the page title are skipped, and processing of a row stops at
    the first cell that contains a nested table.
    """
    forms = []
    col_headers = []
    for row in table_tag.find_html("tr"):
        header_col = 0
        for th in row.find_html("th"):
            if not th.contain_node(NodeKind.BOLD):
                label = clean_node(wxr, None, th)
                span = int(th.attrs.get("colspan", "1"))
                if label != "przypadek":
                    col_headers.append(
                        TableHeader(label, header_col, header_col + span)
                    )
                    header_col += span
                continue

            # comparative forms in the second and third table header
            label_nodes = []
            for child in th.children:
                is_bold = (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.BOLD
                )
                if not is_bold:
                    label_nodes.append(child)
                    continue
                form = Form(
                    form=clean_node(wxr, None, child),
                    raw_tags=[clean_node(wxr, None, label_nodes)],
                    sense_index=sense_index,
                )
                translate_raw_tags(form)
                forms.append(form)

        # td tags
        header_col = 0
        data_col = 0
        row_label = ""
        only_header_cells = all(
            td.attrs.get("class", "") == "forma" for td in row.find_html("td")
        )
        for td in row.find_html("td"):
            if any(td.find_html("table")):
                break
            cell_text = clean_node(wxr, None, td)
            if only_header_cells:
                col_headers.append(
                    TableHeader(cell_text, header_col, header_col + 1)
                )
                header_col += 1
                continue
            if td.attrs.get("class", "") == "forma":
                row_label = cell_text
                continue

            span = int(td.attrs.get("colspan", "1"))
            cell_end = data_col + span
            if cell_text != wxr.wtp.title:
                form = Form(form=cell_text, sense_index=sense_index)
                if row_label != "":
                    form.raw_tags.append(row_label)
                for header in col_headers:
                    if header.start < cell_end and data_col < header.end:
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                forms.append(form)
            # The column position advances for skipped cells as well.
            data_col = cell_end
    return forms
def extract_odmiana_czasownik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract forms from a verb conjugation template.

    Template documentation:
    https://pl.wiktionary.org/wiki/Szablon:odmiana-czasownik-polski

    The template is expanded and each HTML table in the result is processed
    by ``extract_odmiana_czasownik_polski_table``. A single column-header
    list is passed to every call, so headers collected from one table stay
    visible while later tables are processed.
    """
    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    shared_col_headers = []
    collected = []
    for table in expanded_root.find_html_recursively("table"):
        table_forms = extract_odmiana_czasownik_polski_table(
            wxr, table, sense_index, shared_col_headers
        )
        collected.extend(table_forms)
    return collected
def extract_odmiana_czasownik_polski_table(
    wxr: WiktextractContext,
    table_tag: WikiNode,
    sense_index: str,
    col_headers: list[TableHeader],
) -> list[Form]:
    """Extract forms from one expanded verb conjugation table.

    A first pass over the rows collects headers. A ``<th>`` in a row without
    ``<td>`` cells and without a "rowspan" attribute is appended to
    ``col_headers`` (the caller's list is mutated). Any other ``<th>``
    becomes a row header: one per link it contains, otherwise one for its
    whole text. Headers reading "forma" or "pozostałe formy" are ignored.

    A second pass splits each ``<td>`` text on line breaks and commas. Every
    non-empty piece that differs from the page title becomes a form tagged
    with the overlapping column headers, then the overlapping row headers.
    Forms from cells with a "potential-form" span also get the tags
    "potential" and "rare". A row is abandoned at the first cell holding a
    nested table.
    """

    def overlaps(header, start, length):
        # Half-open range test between a header and a cell span.
        return header.start < start + length and start < header.end

    forms = []
    row_headers = []
    for row_index, tr_tag in enumerate(table_tag.find_html("tr")):
        row_has_data = any(t for t in tr_tag.find_html("td"))
        header_col = 0
        for th_tag in tr_tag.find_html("th"):
            th_text = clean_node(wxr, None, th_tag)
            if th_text in ["forma", "pozostałe formy"]:
                continue
            if not row_has_data and "rowspan" not in th_tag.attrs:
                col_span = int(th_tag.attrs.get("colspan", "1"))
                col_headers.append(
                    TableHeader(th_text, header_col, header_col + col_span)
                )
                header_col += col_span
                continue

            row_span = int(th_tag.attrs.get("rowspan", "1"))
            row_end = row_index + row_span
            if th_tag.contain_node(NodeKind.LINK):
                for link_node in th_tag.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    row_headers.append(
                        TableHeader(link_text, row_index, row_end)
                    )
            else:
                row_headers.append(TableHeader(th_text, row_index, row_end))

    for row_index, tr_tag in enumerate(table_tag.find_html("tr")):
        data_col = 0
        for td_tag in tr_tag.find_html("td"):
            if any(t for t in td_tag.find_html("table")):
                break
            td_text = clean_node(wxr, None, td_tag)
            col_span = int(td_tag.attrs.get("colspan", "1"))
            row_span = int(td_tag.attrs.get("rowspan", "1"))
            # "Szablon:potencjalnie" uses "{{int:potential-form-tooltip}}",
            # a magic word that is not implemented, so the marker span is
            # detected instead.
            is_potential_form = any(
                True
                for _ in td_tag.find_html(
                    "span", attr_name="class", attr_value="potential-form"
                )
            )

            for line in td_text.splitlines():
                for piece in line.split(","):
                    form_text = piece.strip()
                    if form_text == "" or form_text == wxr.wtp.title:
                        continue
                    form = Form(form=form_text, sense_index=sense_index)
                    form.raw_tags.extend(
                        header.text
                        for header in col_headers
                        if overlaps(header, data_col, col_span)
                    )
                    form.raw_tags.extend(
                        header.text
                        for header in row_headers
                        if overlaps(header, row_index, row_span)
                    )
                    translate_raw_tags(form)
                    if is_potential_form:
                        form.tags.extend(["potential", "rare"])
                    forms.append(form)

            data_col += col_span

    return forms
def odmiana_rzeczownik_esperanto(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract forms from an Esperanto noun declension template.

    Template documentation:
    https://pl.wiktionary.org/wiki/Szablon:odmiana-rzeczownik-esperanto

    The template is expanded first. "short-content" spans reading "blm" or
    "blp" add the tags "no-plural" or "no-singulative" to every form. In
    each table, rows without ``<td>`` cells provide column headers; in other
    rows the last non-empty ``<th>`` is the row header. Each non-empty
    ``<td>`` that differs from the page title becomes a form tagged with the
    row header and the column header at the same position. Cells containing
    a "potential-form" span also get "potential" and "rare".
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    forms = []
    col_headers = []
    tags = []
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="short-content"
    ):
        span_text = clean_node(wxr, None, span_tag)
        if span_text == "blm":
            tags.append("no-plural")
        elif span_text == "blp":
            tags.append("no-singulative")

    for table_tag in expanded_node.find_html_recursively("table"):
        for tr_tag in table_tag.find_html("tr"):
            is_header_row = not any(t for t in tr_tag.find_html("td"))
            row_header = ""
            for th_tag in tr_tag.find_html("th"):
                th_text = clean_node(wxr, None, th_tag)
                if th_text == "":
                    continue
                if is_header_row:
                    col_headers.append(th_text)
                else:
                    row_header = th_text

            for td_index, td_tag in enumerate(tr_tag.find_html("td")):
                form_text = clean_node(wxr, None, td_tag)
                # Add the tags once per cell, however many marker spans it
                # holds, so the tag list never contains duplicates.
                is_potential_form = any(
                    True
                    for _ in td_tag.find_html_recursively(
                        "span", attr_name="class", attr_value="potential-form"
                    )
                )
                td_tags = ["potential", "rare"] if is_potential_form else []
                if form_text == "" or form_text == wxr.wtp.title:
                    continue
                form = Form(
                    form=form_text,
                    sense_index=sense_index,
                    tags=tags + td_tags,
                )
                if row_header != "":
                    form.raw_tags.append(row_header)
                if td_index < len(col_headers):
                    form.raw_tags.append(col_headers[td_index])
                translate_raw_tags(form)
                forms.append(form)

    return forms