Coverage for src/wiktextract/extractor/nl/pos.py: 92%
193 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .example import (
8 EXAMPLE_TEMPLATES,
9 extract_example_list_item,
10 extract_example_template,
11)
12from .models import AltForm, Form, Sense, WordEntry
13from .section_titles import LINKAGE_SECTIONS, POS_DATA
14from .tags import (
15 GLOSS_TAG_TEMPLATES,
16 LIST_ITEM_TAG_TEMPLATES,
17 translate_raw_tags,
18)
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section and fill it in.

    A deep copy of ``base_data`` is appended to ``page_data`` and gets its
    ``pos_title``, ``pos`` and ``tags`` from ``POS_DATA[pos_title]``.  The
    forms and categories collected in ``forms_data`` are copied into the new
    entry when the POS values agree; otherwise ``forms_data`` is emptied.
    The section's child nodes are then handed to
    ``extract_pos_section_nodes``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Forms data without a known POS adopts the POS of this section.
    if forms_data.pos == "unknown":
        forms_data.pos = entry.pos
    if forms_data.pos != entry.pos:
        forms_data.forms.clear()
        forms_data.categories.clear()
    else:
        entry.forms.extend(forms_data.forms)
        entry.categories.extend(forms_data.categories)

    extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node)

    # Node extraction may have appended further entries, so look at the
    # current last entry rather than the one created above.
    last_entry = page_data[-1]
    if pos_title in LINKAGE_SECTIONS and not last_entry.senses:
        page_data.pop()
# Templates handled by `extract_noun_form_of_template`, matched by exact name.
_NOUN_FORM_OF_TEMPLATE_NAMES = frozenset(
    {
        "noun-pl",
        "nl-advb-form",
        "noun-dim",
        "noun-dim-pl",
        "num-form",
        "ordn-form",
        "prep-form",
        "pronom-dem-form",
        "pronom-pos-form",
        "xh-pronom-pos-form",
        "oudeschrijfwijze",
    }
)
# Templates handled by `extract_noun_form_of_template`, matched by suffix.
_NOUN_FORM_OF_TEMPLATE_SUFFIXES = ("adjc-form", "adverb-form", "noun-form")

# Templates handled by `extract_verb_form_of_template`, matched by prefix.
_VERB_FORM_OF_TEMPLATE_PREFIXES = (
    "1ps",
    "2ps",
    "aanv-w",
    "onv-d",
    "ott-",
    "ovt-",
    "tps",
    "volt-d",
    "eng-onv-d",
    # Categorie:Bijvoeglijknaamwoordsjablonen
    "dan-adjc-",
    "la-adjc-",
    "nno-adjc-",
    "nor-adjc-",
    "swe-adjc-",
)
# Templates handled by `extract_verb_form_of_template`, matched by suffix.
_VERB_FORM_OF_TEMPLATE_SUFFIXES = (
    # Categorie:Werkwoordsvormsjablonen
    "verb-form",
    "-gw",
    "-lv",
    "-lv-vt",
    "-lv-vtd",
    "-onv-d",
    "-twt",
    "-vt",
    "-vt-onr",
    "-3ps",
    "-inf",
    "-lv-hv",
    "-twt-bv",
    "-twt-hv",
    "-vt-onr-bv",
    "-vt-onr-hv",
)
# Templates handled by `extract_verb_form_of_template`, matched by exact name.
_VERB_FORM_OF_TEMPLATE_NAMES = frozenset(
    {"fra-deelwoord", "2ps-rus", "ww-kur", "ww-tur"}
)


def _is_noun_form_of_template(template_name: str) -> bool:
    """Return True if the template goes to `extract_noun_form_of_template`."""
    return (
        template_name in _NOUN_FORM_OF_TEMPLATE_NAMES
        or template_name.endswith(_NOUN_FORM_OF_TEMPLATE_SUFFIXES)
        or re.search(r"-dec\d+", template_name) is not None
    )


def _is_verb_form_of_template(template_name: str) -> bool:
    """Return True if the template goes to `extract_verb_form_of_template`."""
    return (
        template_name.startswith(_VERB_FORM_OF_TEMPLATE_PREFIXES)
        or template_name.endswith(_VERB_FORM_OF_TEMPLATE_SUFFIXES)
        or template_name in _VERB_FORM_OF_TEMPLATE_NAMES
    )


def extract_pos_section_nodes(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Walk the children of a POS section and fill in ``page_data[-1]``.

    Each child of ``level_node`` is dispatched on its type:

    * lists whose marker ends with ``#`` or ``::`` are gloss lists; the nodes
      before the first ``#`` list are treated as the header line;
    * a nested level node that is a non-linkage POS title is parsed as its
      own section, any other nested level node stops the walk;
    * example templates are attached to the latest sense;
    * form-of templates create senses through the noun/verb form-of
      extractors;
    * any other template that expands to ``(...)`` becomes a raw tag of the
      latest sense;
    * the first bold node is read as the canonical form.
    """
    # A separate flag is used instead of comparing a stored index with 0:
    # a gloss list can itself be the child at index 0, and then the header
    # line must not be extracted again for every later list.
    header_line_extracted = False
    is_first_bold = True
    for index, node in enumerate(level_node.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.endswith(("#", "::"))
        ):
            if not header_line_extracted and node.sarg.endswith("#"):
                header_line_extracted = True
                extract_pos_header_line_nodes(
                    wxr, page_data[-1], level_node.children[:index]
                )
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                parent_sense = None
                if node.sarg.endswith("##") and len(page_data[-1].senses) > 0:
                    # Find the first sense whose glosses equal the leading
                    # glosses of the latest sense; it acts as the parent.
                    p_glosses_len = len(node.sarg) - 1
                    for sense in page_data[-1].senses:
                        if (
                            sense.glosses
                            == page_data[-1].senses[-1].glosses[:p_glosses_len]
                        ):
                            parent_sense = sense
                            break
                extract_gloss_list_item(
                    wxr, page_data[-1], list_item, parent_sense
                )
        elif isinstance(node, LevelNode):
            title_text = clean_node(wxr, None, node.largs)
            if title_text in POS_DATA and title_text not in LINKAGE_SECTIONS:
                # expanded from "eng-onv-d" form-of template
                from .page import parse_section

                parse_section(wxr, page_data, base_data, forms_data, node)
            else:
                break
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in EXAMPLE_TEMPLATES
            and len(page_data[-1].senses) > 0
        ):
            extract_example_template(wxr, page_data[-1].senses[-1], node)
        elif isinstance(node, TemplateNode) and _is_noun_form_of_template(
            node.template_name
        ):
            extract_noun_form_of_template(wxr, page_data[-1], node)
        elif isinstance(node, TemplateNode) and _is_verb_form_of_template(
            node.template_name
        ):
            extract_verb_form_of_template(
                wxr, page_data, base_data, forms_data, node
            )
        elif isinstance(node, TemplateNode):
            # tag template after form-of template
            cats = {}
            expanded_text = clean_node(wxr, cats, node)
            if (
                expanded_text.startswith("(")
                and expanded_text.endswith(")")
                and len(page_data[-1].senses) > 0
            ):
                last_sense = page_data[-1].senses[-1]
                last_sense.raw_tags.append(expanded_text.strip("() "))
                last_sense.categories.extend(cats.get("categories", []))
                translate_raw_tags(last_sense)
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            extract_form_line_bold_node(wxr, page_data[-1], node)
            is_first_bold = False
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item into a sense of ``word_entry``.

    A ``::`` item continues the latest existing sense; every other item
    starts a new sense, copied from ``parent_sense`` when one is given.
    Tag templates and parenthesised text become tags, ``*`` sub-lists are
    read as examples, and the remaining nodes form the gloss text.  Nested
    ``#`` lists are processed recursively with this sense as their parent.
    """
    reuse_last_sense = list_item.sarg == "::" and len(word_entry.senses) > 0
    if reuse_last_sense:
        sense = word_entry.senses[-1]
    elif parent_sense is not None:
        sense = parent_sense.model_copy(deep=True)
    else:
        sense = Sense()

    gloss_parts = []

    def add_tag_or_gloss_text(node: WikiNode) -> None:
        # Text wrapped in parentheses is a qualifier, anything else is part
        # of the gloss.
        text = clean_node(wxr, sense, node)
        if text.startswith("(") and text.endswith(")"):
            sense.raw_tags.append(text.strip("() "))
        else:
            gloss_parts.append(text)

    for child in list_item.children:
        if isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name in GLOSS_TAG_TEMPLATES:
                sense.raw_tags.append(clean_node(wxr, sense, child))
            elif template_name in LIST_ITEM_TAG_TEMPLATES:
                sense.tags.append(LIST_ITEM_TAG_TEMPLATES[template_name])
            else:
                add_tag_or_gloss_text(child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            if child.sarg.endswith("*"):
                for example_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, example_item)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            add_tag_or_gloss_text(child)
        else:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    while gloss_text.startswith(","):  # between qualifier templates
        gloss_text = gloss_text.removeprefix(",").strip()
    leading_parens = re.match(r"\(([^()]+)\)", gloss_text)
    if leading_parens is not None:
        remainder = gloss_text[leading_parens.end() :].strip()
        if remainder == "":  # gloss text after form-of template
            gloss_text = leading_parens.group(1)
        else:
            # expanded "verouderd" template in "2ps" template
            gloss_text = remainder
            sense.raw_tags.append(leading_parens.group(1))

    if gloss_text:
        sense.glosses.append(gloss_text)
    # Only senses that carry some data are kept.
    if sense.glosses or sense.tags or sense.raw_tags or sense.examples:
        translate_raw_tags(sense)
        if not reuse_last_sense:
            word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if marker.startswith("#") and marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
def extract_pos_header_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Read the nodes that precede the first gloss list of a POS section.

    The first ``[...]`` found in a plain string is stored as the etymology
    index, ``-l-`` templates contribute gender/number tags and ``dimt``
    templates contribute a raw tag.
    """
    for node in nodes:
        if isinstance(node, str):
            # Only the first bracketed text is kept.
            if word_entry.etymology_index != "":
                continue
            bracketed = re.search(r"\[(.+)\]", node.strip())
            if bracketed is not None:
                word_entry.etymology_index = bracketed.group(1).strip()
            continue
        if not isinstance(node, TemplateNode):
            continue
        template_name = node.template_name
        if template_name == "-l-":
            extract_l_template(wxr, word_entry, node)
        elif template_name == "dimt":
            word_entry.raw_tags.append(clean_node(wxr, word_entry, node))
            translate_raw_tags(word_entry)
def extract_l_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Add the tags encoded by the first argument of a ``-l-`` template.

    Unrecognised arguments add nothing.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-l-
    # NOTE(review): there is no "f" key here although the noun form-of
    # gender table in this file has one - confirm against the template page.
    tags_by_arg = {
        "n": "neuter",
        "m": "masculine",
        "fm": ["feminine", "masculine"],
        "p": "plural",
    }
    arg = clean_node(wxr, None, node.template_parameters.get(1, ""))
    mapped = tags_by_arg.get(arg)
    if mapped is None:
        return
    word_entry.tags.extend([mapped] if isinstance(mapped, str) else mapped)
299# https://nl.wiktionary.org/wiki/Sjabloon:noun-pl
300# https://nl.wiktionary.org/wiki/Sjabloon:noun-form
301# https://nl.wiktionary.org/wiki/Sjabloon:oudeschrijfwijze
302# "getal" and "gesl" args
303NOUN_FORM_OF_TEMPLATE_NUM_TAGS = {
304 "s": "singular",
305 "p": "plural",
306 "d": "dual",
307 "c": "collective",
308 "a": "animate",
309 "i": "inanimate",
310}
311NOUN_FORM_OF_TEMPLATE_GENDER_TAGS = {
312 "m": "masculine",
313 "f": "feminine",
314 "n": "neuter",
315 "c": "common",
316 "fm": ["feminine", "masculine"],
317 "mf": ["feminine", "masculine"],
318 "mn": ["masculine", "neuter"],
319}
def extract_oudeschrijfwijze_template_g_arg(
    wxr: WiktextractContext, g_arg: str, sense: Sense
) -> bool:
    """Add the tag(s) for a "g" argument value to ``sense``.

    The gender table is consulted before the number table.  Returns True
    when ``g_arg`` was found in one of them, False otherwise.
    """
    lookup_tables = (
        NOUN_FORM_OF_TEMPLATE_GENDER_TAGS,
        NOUN_FORM_OF_TEMPLATE_NUM_TAGS,
    )
    for table in lookup_tables:
        if g_arg not in table:
            continue
        mapped = table[g_arg]
        if isinstance(mapped, str):
            sense.tags.append(mapped)
        elif isinstance(mapped, list):
            sense.tags.extend(mapped)
        return True
    return False
def extract_oudeschrijfwijze_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Read the "g" argument of an ``oudeschrijfwijze`` template into tags.

    The cleaned text of the argument is tried first.  When it is not a
    known value and the raw argument is a list, the name of each template
    node in that list is tried instead.
    """
    raw_g_arg = t_node.template_parameters.get("g", "")
    g_text = clean_node(wxr, None, raw_g_arg)
    if extract_oudeschrijfwijze_template_g_arg(wxr, g_text, sense):
        return
    if not isinstance(raw_g_arg, list):
        return
    for part in raw_g_arg:
        if isinstance(part, TemplateNode):
            extract_oudeschrijfwijze_template_g_arg(
                wxr, part.template_name, sense
            )
def extract_noun_form_of_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Append a "form-of" sense built from a form template to ``word_entry``.

    Number and gender tags come from the template name and its "getal" and
    "gesl" arguments, the base word from the first positional argument and
    the gloss from the first list item of the expanded template.
    """
    # https://nl.wiktionary.org/wiki/Categorie:Vormsjablonen
    params = t_node.template_parameters
    template_name = t_node.template_name
    sense = Sense(tags=["form-of"])

    if template_name.endswith("-pl"):
        sense.tags.append("plural")
    else:
        num_arg = clean_node(wxr, None, params.get("getal", ""))
        num_tag = NOUN_FORM_OF_TEMPLATE_NUM_TAGS.get(num_arg)
        if num_tag is not None:
            sense.tags.append(num_tag)

    gender_arg = clean_node(wxr, None, params.get("gesl", ""))
    gender_tag = NOUN_FORM_OF_TEMPLATE_GENDER_TAGS.get(gender_arg, [])
    sense.tags.extend(
        [gender_tag] if isinstance(gender_tag, str) else gender_tag
    )

    # Sjabloon:oudeschrijfwijze
    if template_name == "oudeschrijfwijze":
        extract_oudeschrijfwijze_template(wxr, t_node, sense)

    base_word = clean_node(wxr, None, params.get(1, ""))
    if base_word != "":
        sense.form_of.append(AltForm(word=base_word))

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Only the first list item of the expansion is used as gloss.
    first_item = next(
        iter(expanded_node.find_child_recursively(NodeKind.LIST_ITEM)), None
    )
    if first_item is not None:
        sense.glosses.append(clean_node(wxr, None, first_item.children))
    # Result is discarded; the call is made with `sense` as the second
    # argument of clean_node.
    clean_node(wxr, sense, expanded_node)
    word_entry.senses.append(sense)
def extract_verb_form_of_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Expand a verb form-of template and extract its content as senses.

    The expanded template is processed like a POS section body.  The entry
    that was last before the call, and any entries added by it, are then
    tagged "form-of", as are their senses, which also receive the base word
    taken from the template argument.
    """
    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen_voor_het_Nederlands
    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen
    from .page import extract_section_categories

    entries_before = len(page_data)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    extract_pos_section_nodes(
        wxr, page_data, base_data, forms_data, expanded_node
    )

    # "la-adjc-form" has the base word in its third argument.
    base_word_key = 3 if t_node.template_name == "la-adjc-form" else 1
    base_word = clean_node(
        wxr, None, t_node.template_parameters.get(base_word_key, "")
    )

    # Negative start index: -1 when no entry was added, one further back
    # for every entry appended during extraction.
    first_affected = entries_before - len(page_data) - 1
    for entry in page_data[first_affected:]:
        for sense in entry.senses:
            sense.tags.append("form-of")
            if base_word != "":
                sense.form_of.append(AltForm(word=base_word))
        extract_section_categories(wxr, entry, expanded_node)
        entry.tags.append("form-of")
def extract_form_line_bold_node(
    wxr: WiktextractContext, word_entry: WordEntry, bold_node: WikiNode
) -> None:
    """Record the text of a bold node as a "canonical" form.

    Nothing is added when the text is empty or equal to the page title.
    """
    word = clean_node(wxr, None, bold_node)
    if word == "" or word == wxr.wtp.title:
        return
    word_entry.forms.append(Form(form=word, tags=["canonical"]))