Coverage for src/wiktextract/extractor/nl/inflection.py: 85%
255 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .models import Form, WordEntry
14from .tags import translate_raw_tags
# Template names recognised as inflection ("forms") tables. Membership is
# exact-match only, so a longer name that merely starts with one of these
# strings is not a member of this set.
FORMS_TABLE_TEMPLATES = frozenset(
    {
        "-nlnoun-",
        "adjcomp",
        "-nlname-",
        "-denoun-",
        "-denoun1-",
        "-nlstam-",
        "-csadjc-comp-",
        "-dumstam-",
    }
)
def extract_inflection_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Dispatch an inflection template to the extractor for its family.

    The extractor is chosen from ``t_node.template_name``; a name that
    matches none of the known families is silently ignored. The chosen
    extractor receives ``wxr``, ``word_entry`` and ``t_node`` unchanged.
    """
    name = t_node.template_name
    # Noun / adjective / proper-name tables share one table layout.
    if name in ("-nlnoun-", "adjcomp", "-nlname-", "-denoun-", "-denoun1-"):
        extract_noun_adj_table(wxr, word_entry, t_node)
        return
    if name == "-nlstam-":
        extract_nlstam_template(wxr, word_entry, t_node)
        return
    # Prefix match: any template whose name begins with "-csadjc-comp-".
    if name.startswith("-csadjc-comp-"):
        extract_csadjc_comp_template(wxr, word_entry, t_node)
        return
    if name == "-dumstam-":
        extract_dumstam_template(wxr, word_entry, t_node)
def extract_noun_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from an expanded noun/adjective inflection table.

    The template is expanded and every table row is scanned: non-empty
    header cells are appended to a running list of column headers, the
    first data cell of a row is its row header, and each non-empty line of
    the remaining data cells becomes a `Form` appended to
    ``word_entry.forms``. Finally `clean_node` is run with ``word_entry``
    on the top-level link nodes of the expansion.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlnoun-
    # https://nl.wiktionary.org/wiki/Sjabloon:adjcomp
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    # Cell lines that never produce a form: empty, dash, or the page title.
    skipped_forms = ("", "-", wxr.wtp.title)
    # Row headers that carry no grammatical information.
    untagged_row_headers = ("", "naamwoord", "demoniem")
    # Never cleared: headers accumulate over all rows and all tables.
    headers = []
    for table in root.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            headers.extend(
                text
                for text in (
                    clean_node(wxr, None, cell)
                    for cell in row.find_child(NodeKind.TABLE_HEADER_CELL)
                )
                if text != ""
            )
            row_label = ""
            for position, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if position == 0:
                    row_label = cell_text
                    continue
                for line in cell_text.splitlines():
                    if line in skipped_forms:
                        continue
                    form = Form(form=line)
                    if row_label not in untagged_row_headers:
                        form.raw_tags.append(row_label)
                    # Data cell N (N >= 1) pairs with collected header N-1.
                    header_slot = position - 1
                    if header_slot < len(headers):
                        form.raw_tags.append(headers[header_slot])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)
def extract_nlstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract principal verb forms from a "-nlstam-" template.

    Positional argument 2 holds the forms tagged "past" and argument 3 the
    forms tagged "past" + "participle"; arguments 5 and 6 hold the matching
    IPA strings. Forms and IPA strings are paired line by line. Afterwards
    the template is cleaned with ``word_entry`` and, unless it was already
    done for this entry, the "/vervoeging" subpage is extracted.
    """
    # verb table
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlstam-
    params = t_node.template_parameters
    for position, form_tags in (
        (2, ["past"]),
        (3, ["past", "participle"]),
    ):
        form_lines = clean_node(
            wxr, None, params.get(position, "")
        ).splitlines()
        # The IPA argument sits three positions after its form argument.
        ipa_lines = clean_node(
            wxr, None, params.get(position + 3, "")
        ).splitlines()
        for line_no, text in enumerate(form_lines):
            if text == "":
                continue
            form = Form(form=text)
            if line_no < len(ipa_lines):
                form.ipa = ipa_lines[line_no]
            form.tags.extend(form_tags)
            word_entry.forms.append(form)
    clean_node(wxr, word_entry, t_node)
    if word_entry.extracted_vervoeging_page:
        return
    extract_vervoeging_page(wxr, word_entry)
    word_entry.extracted_vervoeging_page = True
def extract_vervoeging_page(
    wxr: WiktextractContext, word_entry: WordEntry
) -> None:
    """Extract verb forms from the "<title>/vervoeging" subpage.

    Looks up the page named ``f"{wxr.wtp.title}/vervoeging"`` and hands
    every supported conjugation-table template found on it to
    `extract_nlverb_template`. Returns without doing anything when the page
    does not exist or has no body.

    Templates are searched in three places:

    * directly under the page root (sense ``""``);
    * under every heading nested inside a level-2 section whose title
      equals ``word_entry.lang`` (sense = that heading's text);
    * directly under such a level-2 section (sense ``""``).
    """
    page = wxr.wtp.get_page(f"{wxr.wtp.title}/vervoeging", 0)
    # A page without body text has nothing to parse.
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    table_templates = (
        "-nlverb-",
        "-nlverb-reflex-",
        "-nlverb-onp-",
        "-dumverb-",
    )
    for t_node in root.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in table_templates:
            extract_nlverb_template(wxr, word_entry, t_node, "")
    for lang_level_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_level_node.largs)
        if lang_name != word_entry.lang:
            continue
        for sense_level_node in lang_level_node.find_child_recursively(
            LEVEL_KIND_FLAGS
        ):
            sense = clean_node(wxr, None, sense_level_node.largs)
            for t_node in sense_level_node.find_child(NodeKind.TEMPLATE):
                if t_node.template_name in table_templates:
                    extract_nlverb_template(wxr, word_entry, t_node, sense)
        # Templates sitting directly under the language heading belong to
        # no sense heading, so they must not inherit the text of whichever
        # sub-heading happened to be processed last.
        for t_node in lang_level_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name in table_templates:
                extract_nlverb_template(wxr, word_entry, t_node, "")
@dataclass
class TableHeader:
    """Text and grid position of one header cell in an inflection table."""

    # Cleaned header text.
    text: str
    # Column offset at which the header cell starts.
    col_index: int
    # Number of columns covered, starting at col_index.
    colspan: int
    # Row offset at which the header cell starts.
    row_index: int
    # Number of rows covered, starting at row_index.
    rowspan: int
# Maps the leading text of a verb-table header cell to the tags it implies.
# Keys are matched with `str.startswith` against the cleaned cell text.
NLVERB_HEADER_PREFIXES = {
    "vervoeging van de bedrijvende vorm van": ["active"],
    "onpersoonlijke lijdende vorm": ["impersonal", "passive"],
    "lijdende vorm": ["passive"],
}
def extract_nlverb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    sense: str,
) -> None:
    """Extract verb forms from an expanded conjugation-table template.

    The template is expanded, `clean_node` is run with ``word_entry`` on
    the top-level link nodes of the expansion, and then every table in it
    is walked row by row. "-dumverb-" templates are delegated to
    `extract_dumverb_table`.

    Each data cell yields one `Form` per text returned by `nl_split_cell`.
    A form is created with the tags collected so far for the current table
    section, ``source`` set to "<title>/vervoeging" and the given
    ``sense``; it then receives the text of every row header and column
    header whose span covers the cell, is passed to `translate_raw_tags`
    and appended to ``word_entry.forms``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlverb-
    # Sjabloon:-nlverb-reflex-
    # Sjabloon:-dumverb-
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)
    if t_node.template_name == "-dumverb-":
        extract_dumverb_table(wxr, word_entry, expanded_node, sense)
        return

    for table_node in expanded_node.find_child(NodeKind.TABLE):
        # Per-table state; reset again whenever a new header section
        # starts (see below).
        row_index = 0
        shared_tags = []
        shared_raw_tags = []
        last_row_all_header = False
        col_headers = []
        row_headers = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            # Offset the starting column for row headers from earlier rows
            # that still span the current row.
            # NOTE(review): the offset added is the header's rowspan, not
            # its colspan -- confirm this is intended.
            for row_header in row_headers:
                if (
                    row_index >= row_header.row_index
                    and row_index < row_header.row_index + row_header.rowspan
                ):
                    col_index += row_header.rowspan

            current_row_all_header = all(
                nlverb_table_cell_is_header(n)
                for n in row_node.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                )
            )
            # A header-only row that follows a row containing data cells
            # starts a new section: forget everything collected so far.
            if current_row_all_header and not last_row_all_header:
                row_index = 0
                shared_tags.clear()
                shared_raw_tags.clear()
                col_headers.clear()
                row_headers.clear()

            small_tag = ""
            is_row_first_node = True
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                # Non-numeric colspan/rowspan attributes fall back to 1.
                cell_colspan = 1
                cell_colspan_str = cell_node.attrs.get("colspan", "1")
                if re.fullmatch(r"\d+", cell_colspan_str):
                    cell_colspan = int(cell_colspan_str)
                cell_rowspan = 1
                cell_rowspan_str = cell_node.attrs.get("rowspan", "1")
                if re.fullmatch(r"\d+", cell_rowspan_str):
                    cell_rowspan = int(cell_rowspan_str)
                cell_str = clean_node(wxr, None, cell_node).strip("| ")
                if cell_str in ["", "—", wxr.wtp.title]:
                    # Nothing to record, but the cell still advances
                    # col_index at the bottom of the loop.
                    pass
                elif nlverb_table_cell_is_header(cell_node):
                    for (
                        header_prefix,
                        prefix_tags,
                    ) in NLVERB_HEADER_PREFIXES.items():
                        if cell_str.startswith(header_prefix):
                            shared_tags.extend(prefix_tags)
                            break
                    else:
                        # No known prefix matched.
                        if cell_str.startswith("vervoeging van "):
                            # Remaining "vervoeging van ..." headers are
                            # ignored.
                            pass
                        elif current_row_all_header:
                            if (
                                is_row_first_node
                                and t_node.template_name == "-nlverb-"
                            ):
                                # First cell of a header-only row in
                                # "-nlverb-": applied to all forms of the
                                # section.
                                shared_raw_tags.append(cell_str)
                            else:
                                col_headers.append(
                                    TableHeader(
                                        cell_str,
                                        col_index,
                                        cell_colspan,
                                        row_index,
                                        cell_rowspan,
                                    )
                                )
                        else:
                            # Header cell in a row that also has data
                            # cells: a row header. Text from the first "("
                            # onward is dropped.
                            if "(" in cell_str:
                                cell_str = cell_str[
                                    : cell_str.index("(")
                                ].strip()
                            row_headers.append(
                                TableHeader(
                                    cell_str,
                                    col_index,
                                    cell_colspan,
                                    row_index,
                                    cell_rowspan,
                                )
                            )
                else:  # data cell
                    has_small_tag = False
                    for small_node in cell_node.find_html("small"):
                        has_small_tag = True
                    if has_small_tag:
                        # A cell containing <small> is not a form: its text
                        # is kept and attached to the next form created in
                        # this row.
                        small_tag = cell_str
                        col_index += cell_colspan
                        continue
                    form_texts = nl_split_cell(cell_str)
                    for form_str in form_texts:
                        form_str = form_str.strip()
                        if len(form_str) == 0:
                            continue
                        # NOTE(review): the same shared_tags and
                        # shared_raw_tags list objects are passed to every
                        # Form. Presumably Form copies them on
                        # construction -- verify, since the raw_tags
                        # appends below would otherwise modify
                        # shared_raw_tags.
                        form = Form(
                            form=form_str,
                            tags=shared_tags,
                            raw_tags=shared_raw_tags,
                            source=f"{wxr.wtp.title}/vervoeging",
                            sense=sense,
                        )
                        if small_tag != "":
                            form.raw_tags.append(small_tag)
                            small_tag = ""
                        # Row headers whose row span covers this row.
                        for row_header in row_headers:
                            if (
                                row_index >= row_header.row_index
                                and row_index
                                < row_header.row_index + row_header.rowspan
                            ):
                                form.raw_tags.append(row_header.text)
                        # Column headers whose column span covers this
                        # cell.
                        for col_header in col_headers:
                            if (
                                col_index >= col_header.col_index
                                and col_index
                                < col_header.col_index + col_header.colspan
                            ):
                                form.raw_tags.append(col_header.text)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)

                col_index += cell_colspan
                is_row_first_node = False

            row_index += 1
            last_row_all_header = current_row_all_header
def nlverb_table_cell_is_header(node: WikiNode) -> bool:
    """Return True when a table cell should be treated as a header.

    A cell counts as a header if it is a real header cell, or if its
    ``class`` attribute is exactly "infoboxrijhoofding".
    """
    if node.kind == NodeKind.TABLE_HEADER_CELL:
        return True
    return node.attrs.get("class", "") == "infoboxrijhoofding"
def extract_csadjc_comp_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from an expanded "-csadjc-comp-" style table.

    Within each row, a header cell sets the raw tag used for the data
    cells that follow it. Every data cell whose text is neither empty nor
    the page title becomes a `Form` appended to ``word_entry.forms``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-csadjc-comp-ý3-
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in root.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            current_header = ""
            for cell in row_node.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    current_header = clean_node(wxr, None, cell)
                    continue
                if cell.kind != NodeKind.TABLE_CELL:
                    continue
                text = clean_node(wxr, None, cell)
                if text in ("", wxr.wtp.title):
                    continue
                form = Form(form=text)
                if current_header != "":
                    form.raw_tags.append(current_header)
                translate_raw_tags(form)
                word_entry.forms.append(form)
def extract_dumstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract principal verb forms from a "-dumstam-" template.

    Positional arguments 1-4 are read in order and tagged as infinitive,
    past singular, past plural and past participle. Values that are empty
    or equal to ``word_entry.word`` are skipped. Afterwards the template is
    cleaned with ``word_entry`` and, unless it was already done for this
    entry, the "/vervoeging" subpage is extracted.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-dumstam-
    tag_sets = [
        ["infinitive"],
        ["past", "singular"],
        ["past", "plural"],
        ["past", "participle"],
    ]
    params = t_node.template_parameters
    # Template arguments are 1-based, hence start=1.
    for position, form_tags in enumerate(tag_sets, start=1):
        text = clean_node(wxr, None, params.get(position, ""))
        if text in ("", word_entry.word):
            continue
        word_entry.forms.append(Form(form=text, tags=form_tags))
    clean_node(wxr, word_entry, t_node)
    if word_entry.extracted_vervoeging_page:
        return
    extract_vervoeging_page(wxr, word_entry)
    word_entry.extracted_vervoeging_page = True
def extract_dumverb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    expanded_node: WikiNode,
    sense: str,
) -> None:
    """Extract forms from the expanded table of a "-dumverb-" template.

    Rows are read from the first table child of ``expanded_node``, or from
    ``expanded_node`` itself when it has no table child. Header cells in a
    header-only row become column headers; a header cell in any other row
    becomes the row header for the rest of that row. Every non-empty line
    of a data cell becomes a `Form` with ``source`` "<title>/vervoeging"
    and the given ``sense``, tagged with the row header and with every
    column header whose span covers the cell's column.
    """
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    table = next(
        iter(expanded_node.find_child(NodeKind.TABLE)), expanded_node
    )
    column_headers = []
    previous_row_was_header = False
    for row in table.find_child(NodeKind.TABLE_ROW):
        cells = list(row.find_child(cell_kinds))
        header_only_row = all(
            nlverb_table_cell_is_header(cell) for cell in cells
        )
        # A header-only row after a data row starts a new header set.
        if header_only_row and not previous_row_was_header:
            column_headers.clear()
        previous_row_was_header = header_only_row

        column = 0
        row_label = ""
        for cell in cells:
            # A non-numeric colspan attribute falls back to 1.
            raw_span = cell.attrs.get("colspan", "1")
            span = int(raw_span) if re.fullmatch(r"\d+", raw_span) else 1
            text = clean_node(wxr, None, cell).strip("!| \n")
            # NOTE(review): skipped cells do not advance the column
            # counter -- confirm this matches the template layout.
            if text in ("", "—", wxr.wtp.title):
                continue
            if nlverb_table_cell_is_header(cell):
                if header_only_row:
                    column_headers.append(
                        TableHeader(text, column, span, 0, 0)
                    )
                    column += span
                else:
                    # Row headers do not take part in column counting.
                    row_label = text
                continue
            for line in text.splitlines():
                stripped = line.strip()
                if stripped == "":
                    continue
                form = Form(
                    form=stripped,
                    source=f"{wxr.wtp.title}/vervoeging",
                    sense=sense,
                )
                if row_label != "":
                    form.raw_tags.append(row_label)
                form.raw_tags.extend(
                    header.text
                    for header in column_headers
                    if header.col_index
                    <= column
                    < header.col_index + header.colspan
                )
                translate_raw_tags(form)
                word_entry.forms.append(form)
            column += span
def nl_split_cell(text: str) -> list[str]:
    """Split the text of one conjugation-table cell into separate forms.

    Returns an empty list for empty input. Text that mixes more than one
    of the markers "/", newline and "(" is returned unsplit as a single
    item. Otherwise the first matching rule applies:

    * "a/ b"      -> ["a", "b"]
    * "a/b rest"  -> ["a rest", "b rest"]
    * "a(b) rest" -> ["a rest", "ab rest"]
    * "a\\nb"      -> ["a", "b"]
    * anything else is returned as a one-item list.
    """
    if not text:
        return []
    marker_count = sum(marker in text for marker in ("/", "\n", "("))
    if marker_count >= 2:
        # Leave messy entries alone; remove this when not applicable anymore
        return [text]
    if "/ " in text:  # "zweerde/ zwoor"
        return [piece.strip() for piece in text.split("/")]
    if "/" in text and " " in text:
        # "zult/zal zweren" -> ["zult zweren", "zal zweren"]
        alternatives, _, remainder = text.partition(" ")
        suffix = " " + remainder
        return [(alt + suffix).strip() for alt in alternatives.split("/")]
    optional = re.match(r"([^()]+)\(([^)]+)\)(.+)", text)
    if optional is not None:
        # "zou(dt) treinsurfen" -> ["zou treinsurfen", "zoudt treinsurfen"]
        stem, extra, rest = optional.groups()
        return [stem + rest, stem + extra + rest]
    if "\n" in text:
        return [line.strip() for line in text.split("\n")]
    return [text]