Coverage for src/wiktextract/extractor/nl/inflection.py: 85%
244 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
import re
from dataclasses import dataclass

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry
from .tags import translate_raw_tags
# Names of the inflection-table templates that this module has an extractor
# for; `extract_inflection_template` dispatches on the same names.
FORMS_TABLE_TEMPLATES = frozenset(
    [
        "-nlnoun-",
        "adjcomp",
        "-nlname-",
        "-denoun-",
        "-denoun1-",
        "-nlstam-",
        "-csadjc-comp-",
        "-dumstam-",
    ]
)
def extract_inflection_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Dispatch an inflection-table template to its dedicated extractor.

    The extractor is picked from ``t_node.template_name``; extracted forms
    are appended to ``word_entry.forms`` by the extractor that is called.
    A template name that matches none of the branches is silently ignored.
    """
    template_name = t_node.template_name
    if template_name in (
        "-nlnoun-",
        "adjcomp",
        "-nlname-",
        "-denoun-",
        "-denoun1-",
    ):
        extract_noun_adj_table(wxr, word_entry, t_node)
    elif template_name == "-nlstam-":
        extract_nlstam_template(wxr, word_entry, t_node)
    elif template_name.startswith("-csadjc-comp-"):
        # Prefix match: the family has suffixed names, e.g. "-csadjc-comp-ý3-"
        extract_csadjc_comp_template(wxr, word_entry, t_node)
    elif template_name == "-dumstam-":
        extract_dumstam_template(wxr, word_entry, t_node)
def extract_noun_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from an expanded noun/adjective inflection table.

    The template is expanded and re-parsed, then every table row is walked:
    non-empty header cells are collected as column headers, the first data
    cell of a row is taken as the row header and each remaining data cell
    contributes one form per text line.  Forms are appended to
    ``word_entry.forms``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlnoun-
    # https://nl.wiktionary.org/wiki/Sjabloon:adjcomp
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Column headers accumulate over all rows and tables of the expansion.
    column_headers = []
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header_node)
                if header_text != "":
                    column_headers.append(header_text)
            row_header = ""
            for col_index, data_node in enumerate(
                row_node.find_child(NodeKind.TABLE_CELL)
            ):
                if col_index == 0:
                    row_header = clean_node(wxr, None, data_node)
                else:
                    for form_str in clean_node(
                        wxr, None, data_node
                    ).splitlines():
                        # Skip blanks, "-" placeholders and the page title.
                        if form_str not in ["", "-", wxr.wtp.title]:
                            form = Form(form=form_str)
                            if row_header not in ["", "naamwoord", "demoniem"]:
                                form.raw_tags.append(row_header)
                            # Data column N (1-based, after the row header
                            # cell) is paired with collected header N - 1.
                            if col_index - 1 < len(column_headers):
                                form.raw_tags.append(
                                    column_headers[col_index - 1]
                                )
                            translate_raw_tags(form)
                            word_entry.forms.append(form)

    # Top-level links of the expansion are cleaned with `word_entry` as the
    # target; presumably this records their categories - confirm against
    # `clean_node`.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)
def extract_nlstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the principal parts from a ``-nlstam-`` verb template.

    Positional arguments 2 and 3 hold the forms (one per text line), tagged
    "past" and "past participle" respectively; arguments 5 and 6 hold the
    matching IPA lines.  Afterwards the ``/vervoeging`` subpage is
    extracted, at most once per ``word_entry``.
    """
    # verb table
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlstam-
    for arg in [2, 3]:
        form_texts = clean_node(
            wxr, None, t_node.template_parameters.get(arg, "")
        )
        # The IPA for form argument N is stored in argument N + 3.
        ipa_texts = clean_node(
            wxr, None, t_node.template_parameters.get(arg + 3, "")
        ).splitlines()
        for index, form_str in enumerate(form_texts.splitlines()):
            if form_str != "":
                form = Form(form=form_str)
                # Forms and IPA strings are paired by line position.
                if index < len(ipa_texts):
                    form.ipa = ipa_texts[index]
                form.tags.extend(
                    ["past"] if arg == 2 else ["past", "participle"]
                )
                word_entry.forms.append(form)
    clean_node(wxr, word_entry, t_node)
    # The flag prevents extracting the conjugation subpage more than once.
    if not word_entry.extracted_vervoeging_page:
        extract_vervoeging_page(wxr, word_entry)
        word_entry.extracted_vervoeging_page = True
def extract_vervoeging_page(
    wxr: WiktextractContext, word_entry: WordEntry
) -> None:
    """Extract verb forms from the ``<page title>/vervoeging`` subpage.

    Conjugation templates are looked up in three places: at the root of the
    subpage, under every heading nested in the level-2 section whose title
    equals ``word_entry.lang`` (the heading text becomes the form's
    ``sense``), and directly under that level-2 heading.  Returns silently
    when the subpage does not exist.
    """
    # NOTE(review): 0 is presumably the main namespace id - confirm against
    # `Wtp.get_page`.
    page = wxr.wtp.get_page(f"{wxr.wtp.title}/vervoeging", 0)
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    table_templates = (
        "-nlverb-",
        "-nlverb-reflex-",
        "-nlverb-onp-",
        "-dumverb-",
    )
    for t_node in root.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in table_templates:
            extract_nlverb_template(wxr, word_entry, t_node, "")
    for lang_level_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_level_node.largs)
        if lang_name != word_entry.lang:
            continue
        for sense_level_node in lang_level_node.find_child_recursively(
            LEVEL_KIND_FLAGS
        ):
            sense = clean_node(wxr, None, sense_level_node.largs)
            for t_node in sense_level_node.find_child(NodeKind.TEMPLATE):
                if t_node.template_name in table_templates:
                    extract_nlverb_template(wxr, word_entry, t_node, sense)
        # Templates placed directly under the language heading belong to no
        # sub-heading, so they get an empty sense instead of whatever the
        # last visited sub-heading happened to be.
        for t_node in lang_level_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name in table_templates:
                extract_nlverb_template(wxr, word_entry, t_node, "")
@dataclass
class TableHeader:
    """A header cell of a conjugation table and the grid area it covers."""

    text: str  # cleaned header text
    col_index: int  # column position recorded when the header was read
    colspan: int  # number of columns covered
    row_index: int  # row position recorded when the header was read
    rowspan: int  # number of rows covered
# Header-cell text prefixes in the expanded verb tables, mapped to the tags
# that `extract_nlverb_template` adds to its shared tag list when a header
# cell starts with the prefix.
NLVERB_HEADER_PREFIXES = {
    "vervoeging van de bedrijvende vorm van": ["active"],
    "onpersoonlijke lijdende vorm": ["impersonal", "passive"],
    "lijdende vorm": ["passive"],
}
def extract_nlverb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    sense: str,
) -> None:
    """Extract verb forms from an expanded conjugation-table template.

    The template is expanded and re-parsed.  ``-dumverb-`` is handed to
    `extract_dumverb_table`; for the other templates every table is walked
    row by row, header cells are recorded as shared, column or row headers,
    and every data cell yields one or more `Form` objects that are appended
    to ``word_entry.forms`` with ``sense`` and a ``/vervoeging`` source.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlverb-
    # Sjabloon:-nlverb-reflex-
    # Sjabloon:-dumverb-
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)
    if t_node.template_name == "-dumverb-":
        extract_dumverb_table(wxr, word_entry, expanded_node, sense)
        return

    for table_node in expanded_node.find_child(NodeKind.TABLE):
        row_index = 0
        shared_tags = []
        shared_raw_tags = []
        last_row_all_header = False
        col_headers = []
        row_headers = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            # Shift the starting column past row headers from earlier rows
            # that still span this row.
            # NOTE(review): the offset added is the header's rowspan, not
            # its colspan; kept unchanged - confirm this is intended.
            for row_header in row_headers:
                if (
                    row_index >= row_header.row_index
                    and row_index < row_header.row_index + row_header.rowspan
                ):
                    col_index += row_header.rowspan

            current_row_all_header = all(
                nlverb_table_cell_is_header(n)
                for n in row_node.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                )
            )
            # A header-only row after a non-header-only row starts a new
            # table section: forget all headers and tags gathered so far.
            if current_row_all_header and not last_row_all_header:
                row_index = 0
                shared_tags.clear()
                shared_raw_tags.clear()
                col_headers.clear()
                row_headers.clear()

            small_tag = ""
            is_row_first_node = True
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_colspan = _cell_span(cell_node, "colspan")
                cell_rowspan = _cell_span(cell_node, "rowspan")
                cell_str = clean_node(wxr, None, cell_node).strip("| ")
                if cell_str in ["", "—", wxr.wtp.title]:
                    # Nothing to record, but the cell still occupies columns
                    # (col_index is advanced below).
                    pass
                elif nlverb_table_cell_is_header(cell_node):
                    for (
                        header_prefix,
                        prefix_tags,
                    ) in NLVERB_HEADER_PREFIXES.items():
                        if cell_str.startswith(header_prefix):
                            shared_tags.extend(prefix_tags)
                            break
                    else:
                        # No known prefix matched.
                        if cell_str.startswith("vervoeging van "):
                            pass
                        elif current_row_all_header:
                            if (
                                is_row_first_node
                                and t_node.template_name == "-nlverb-"
                            ):
                                shared_raw_tags.append(cell_str)
                            else:
                                col_headers.append(
                                    TableHeader(
                                        cell_str,
                                        col_index,
                                        cell_colspan,
                                        row_index,
                                        cell_rowspan,
                                    )
                                )
                        else:
                            # Drop a parenthesised suffix from row headers.
                            if "(" in cell_str:
                                cell_str = cell_str[
                                    : cell_str.index("(")
                                ].strip()
                            row_headers.append(
                                TableHeader(
                                    cell_str,
                                    col_index,
                                    cell_colspan,
                                    row_index,
                                    cell_rowspan,
                                )
                            )
                else:  # data cell
                    has_small_tag = any(
                        True for _ in cell_node.find_html("small")
                    )
                    if has_small_tag:
                        # A cell with <small> content is kept as a raw tag
                        # for the first form of the next data cell.
                        small_tag = cell_str
                        col_index += cell_colspan
                        continue
                    for form_str in _split_nlverb_forms(cell_str):
                        form_str = form_str.strip()
                        if len(form_str) == 0:
                            continue
                        form = Form(
                            form=form_str,
                            # Copies: the appends below must never reach the
                            # lists shared by all forms of this section.
                            tags=list(shared_tags),
                            raw_tags=list(shared_raw_tags),
                            source=f"{wxr.wtp.title}/vervoeging",
                            sense=sense,
                        )
                        if small_tag != "":
                            form.raw_tags.append(small_tag)
                            small_tag = ""
                        for row_header in row_headers:
                            if (
                                row_index >= row_header.row_index
                                and row_index
                                < row_header.row_index + row_header.rowspan
                            ):
                                form.raw_tags.append(row_header.text)
                        for col_header in col_headers:
                            if (
                                col_index >= col_header.col_index
                                and col_index
                                < col_header.col_index + col_header.colspan
                            ):
                                form.raw_tags.append(col_header.text)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)

                col_index += cell_colspan
                is_row_first_node = False

            row_index += 1
            last_row_all_header = current_row_all_header


def _cell_span(cell_node: WikiNode, attr_name: str) -> int:
    """Return the cell's numeric span attribute, or 1 if absent/invalid."""
    span_str = cell_node.attrs.get(attr_name, "1")
    if re.fullmatch(r"\d+", span_str):
        return int(span_str)
    return 1


def _split_nlverb_forms(cell_str: str) -> list[str]:
    """Split the text of a data cell into its (unstripped) form strings."""
    if "/ " in cell_str:  # "zweerde/ zwoor"
        return cell_str.split("/")
    if "/" in cell_str and " " in cell_str:
        # "zult/zal zweren" -> ["zult zweren", "zal zweren"]
        space_index = cell_str.index(" ")
        second_part = cell_str[space_index:]
        return [
            f_str + second_part for f_str in cell_str[:space_index].split("/")
        ]
    return [cell_str]
def nlverb_table_cell_is_header(node: WikiNode) -> bool:
    """Return True if a table cell should be treated as a header.

    A cell counts as a header when it is a real header cell or when its
    ``class`` attribute is exactly "infoboxrijhoofding".
    """
    return (
        node.kind == NodeKind.TABLE_HEADER_CELL
        or node.attrs.get("class", "") == "infoboxrijhoofding"
    )
def extract_csadjc_comp_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from an expanded ``-csadjc-comp-`` table template.

    Within each row the most recent header cell labels the data cells that
    follow it.  Every data cell whose text is neither empty nor the page
    title becomes a `Form` appended to ``word_entry.forms``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-csadjc-comp-ý3-
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            for cell_node in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header = clean_node(wxr, None, cell_node)
                elif cell_node.kind == NodeKind.TABLE_CELL:
                    form_text = clean_node(wxr, None, cell_node)
                    if form_text not in ["", wxr.wtp.title]:
                        form = Form(form=form_text)
                        if row_header != "":
                            form.raw_tags.append(row_header)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
def extract_dumstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the principal parts from a ``-dumstam-`` template.

    Positional arguments 1-4 are read as infinitive, past singular, past
    plural and past participle.  Values that are empty or equal to
    ``word_entry.word`` are skipped.  Afterwards the ``/vervoeging`` subpage
    is extracted, at most once per ``word_entry``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-dumstam-
    # tags[i] belongs to positional argument i + 1.
    tags = [
        ["infinitive"],
        ["past", "singular"],
        ["past", "plural"],
        ["past", "participle"],
    ]
    for arg_name in range(1, 5):
        word = clean_node(
            wxr, None, t_node.template_parameters.get(arg_name, "")
        )
        if word not in ["", word_entry.word]:
            form = Form(form=word, tags=list(tags[arg_name - 1]))
            word_entry.forms.append(form)
    clean_node(wxr, word_entry, t_node)
    # The flag prevents extracting the conjugation subpage more than once.
    if not word_entry.extracted_vervoeging_page:
        extract_vervoeging_page(wxr, word_entry)
        word_entry.extracted_vervoeging_page = True
def extract_dumverb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    expanded_node: WikiNode,
    sense: str,
) -> None:
    """Extract verb forms from an expanded ``-dumverb-`` table.

    Only the first table child of ``expanded_node`` is read; if there is
    none, the rows are looked up on ``expanded_node`` itself.  Header-only
    rows supply column headers, a header cell in a mixed row is the row
    header, and each text line of a data cell becomes a `Form` appended to
    ``word_entry.forms`` with ``sense`` and a ``/vervoeging`` source.
    """
    table_node = expanded_node
    for t_node in expanded_node.find_child(NodeKind.TABLE):
        table_node = t_node
        break
    col_headers = []
    last_row_all_header = False
    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        row_header = ""
        current_row_all_header = all(
            nlverb_table_cell_is_header(n)
            for n in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
        )
        # A header-only row after a non-header-only row starts a new section
        # with fresh column headers.
        if current_row_all_header and not last_row_all_header:
            col_headers.clear()
        for cell_node in row_node.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_colspan = 1
            cell_colspan_str = cell_node.attrs.get("colspan", "1")
            if re.fullmatch(r"\d+", cell_colspan_str):
                cell_colspan = int(cell_colspan_str)
            cell_str = clean_node(wxr, None, cell_node).strip("!| \n")
            # NOTE(review): skipped cells do not advance col_index, so a
            # blank or "—" data cell shifts the header lookup for the cells
            # after it - confirm this matches the template's layout.
            if cell_str in ["", "—", wxr.wtp.title]:
                continue
            is_header = nlverb_table_cell_is_header(cell_node)
            if is_header:
                if current_row_all_header:
                    # Row position and span are not used for column headers
                    # in this function, hence the zeros.
                    col_headers.append(
                        TableHeader(
                            cell_str,
                            col_index,
                            cell_colspan,
                            0,
                            0,
                        )
                    )
                    col_index += cell_colspan
                else:
                    # Row headers do not take part in column counting.
                    row_header = cell_str
            else:
                for cell_line in cell_str.splitlines():
                    cell_line = cell_line.strip()
                    if cell_line == "":
                        continue
                    form = Form(
                        form=cell_line,
                        source=f"{wxr.wtp.title}/vervoeging",
                        sense=sense,
                    )
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for col_header in col_headers:
                        if (
                            col_index >= col_header.col_index
                            and col_index
                            < col_header.col_index + col_header.colspan
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                col_index += cell_colspan
        last_row_all_header = current_row_all_header