Coverage for src/wiktextract/extractor/nl/pos.py: 90%
172 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .example import (
8 EXAMPLE_TEMPLATES,
9 extract_example_list_item,
10 extract_example_template,
11)
12from .models import AltForm, Sense, WordEntry
13from .section_titles import LINKAGE_SECTIONS, POS_DATA
14from .tags import (
15 GLOSS_TAG_TEMPLATES,
16 LIST_ITEM_TAG_TEMPLATES,
17 translate_raw_tags,
18)
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Start a new word entry for a part-of-speech section and fill it.

    A deep copy of ``base_data`` is appended to ``page_data`` and receives the
    ``pos`` value and optional tags that ``POS_DATA`` holds for ``pos_title``.
    Forms and categories collected in ``forms_data`` are copied into the new
    entry when the POS values agree (an "unknown" ``forms_data.pos`` adopts
    the entry's POS first); otherwise they are cleared from ``forms_data``.
    The section's child nodes are then processed, and the last entry is
    dropped again if it has no senses and ``pos_title`` is also a linkage
    section title.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    if forms_data.pos == "unknown":
        forms_data.pos = entry.pos
    if forms_data.pos != entry.pos:
        forms_data.forms.clear()
        forms_data.categories.clear()
    else:
        entry.forms.extend(forms_data.forms)
        entry.categories.extend(forms_data.categories)

    extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node)
    # The call above receives `page_data` itself, so the last element is
    # looked up again here instead of reusing `entry`.
    if len(page_data[-1].senses) == 0 and pos_title in LINKAGE_SECTIONS:
        page_data.pop()
def _is_noun_form_template(template_name: str) -> bool:
    """Return True for names handled by ``extract_noun_form_of_template``."""
    return (
        template_name
        in (
            "noun-pl",
            "nl-advb-form",
            "noun-dim",
            "noun-dim-pl",
            "num-form",
            "ordn-form",
            "prep-form",
            "pronom-dem-form",
            "pronom-pos-form",
            "xh-pronom-pos-form",
            "oudeschrijfwijze",
        )
        or template_name.endswith(("adjc-form", "adverb-form", "noun-form"))
        or re.search(r"-dec\d+", template_name) is not None
    )


def _is_verb_form_template(template_name: str) -> bool:
    """Return True for names handled by ``extract_verb_form_of_template``."""
    return template_name.startswith(
        (
            "1ps",
            "2ps",
            "aanv-w",
            "onv-d",
            "ott-",
            "ovt-",
            "tps",
            "volt-d",
            "eng-onv-d",
        )
    ) or template_name.endswith("verb-form")


def extract_pos_section_nodes(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch each child of ``level_node`` to the matching extractor.

    Handling per child, in this order of precedence:

    * list whose ``sarg`` ends with ``#`` or ``::``: every list item goes to
      ``extract_gloss_list_item``; before the first ``#`` list, the siblings
      preceding it are passed once to ``extract_pos_header_line_nodes``;
    * ``LevelNode``: parsed with ``parse_section`` when its title is in
      ``POS_DATA`` but not in ``LINKAGE_SECTIONS``, otherwise the walk stops;
    * example template (only when the last entry already has a sense): added
      to that entry's last sense;
    * noun-like / verb form-of templates: handed to their extractors;
    * any other template: if its expanded text is wrapped in parentheses and
      the last entry has a sense, the text becomes a raw tag of that sense.

    ``page_data[-1]`` is looked up at each use because nested calls receive
    ``page_data`` and may append entries to it.
    """
    # A flag is used instead of remembering the list index: index 0 is a
    # valid position, so a 0 "not seen yet" marker would make every later
    # "#" list re-run header extraction over all of its preceding siblings.
    header_extracted = False
    for index, node in enumerate(level_node.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.endswith(("#", "::"))
        ):
            if not header_extracted and node.sarg.endswith("#"):
                header_extracted = True
                extract_pos_header_line_nodes(
                    wxr, page_data[-1], level_node.children[:index]
                )
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
        elif isinstance(node, LevelNode):
            title_text = clean_node(wxr, None, node.largs)
            if title_text in POS_DATA and title_text not in LINKAGE_SECTIONS:
                # expanded from "eng-onv-d" form-of template
                from .page import parse_section

                parse_section(wxr, page_data, base_data, forms_data, node)
            else:
                break
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in EXAMPLE_TEMPLATES
            and len(page_data[-1].senses) > 0
        ):
            extract_example_template(wxr, page_data[-1].senses[-1], node)
        elif isinstance(node, TemplateNode) and _is_noun_form_template(
            node.template_name
        ):
            extract_noun_form_of_template(wxr, page_data[-1], node)
        elif isinstance(node, TemplateNode) and _is_verb_form_template(
            node.template_name
        ):
            extract_verb_form_of_template(
                wxr, page_data, base_data, forms_data, node
            )
        elif isinstance(node, TemplateNode):
            # tag template after form-of template
            cats = {}
            expanded_text = clean_node(wxr, cats, node)
            if (
                expanded_text.startswith("(")
                and expanded_text.endswith(")")
                and len(page_data[-1].senses) > 0
            ):
                last_sense = page_data[-1].senses[-1]
                last_sense.raw_tags.append(expanded_text.strip("() "))
                last_sense.categories.extend(cats.get("categories", []))
                translate_raw_tags(last_sense)
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    """Turn one gloss list item into a sense, or extend the previous sense.

    A ``::`` item reuses the entry's last sense when one exists; every other
    item starts a fresh ``Sense``.  Templates and italic nodes contribute
    tags, raw tags or gloss text, nested ``*`` lists contribute examples, and
    all remaining children form the gloss.  A new sense is appended to
    ``word_entry`` only if it ended up with a gloss, tag, raw tag or example.
    """
    continues_previous = list_item.sarg == "::" and len(word_entry.senses) > 0
    create_new_sense = not continues_previous
    sense = word_entry.senses[-1] if continues_previous else Sense()
    gloss_parts = []

    def add_tag_or_gloss(text: str) -> None:
        # Fully parenthesised text is a qualifier, anything else is gloss.
        if text.startswith("(") and text.endswith(")"):
            sense.raw_tags.append(text.strip("() "))
        else:
            gloss_parts.append(text)

    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in GLOSS_TAG_TEMPLATES:
                sense.raw_tags.append(clean_node(wxr, sense, child))
            elif name in LIST_ITEM_TAG_TEMPLATES:
                sense.tags.append(LIST_ITEM_TAG_TEMPLATES[name])
            else:
                add_tag_or_gloss(clean_node(wxr, sense, child))
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            if child.sarg.endswith("*"):
                for example_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, example_item)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            add_tag_or_gloss(clean_node(wxr, sense, child))
        else:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    # Commas left over between qualifier templates are dropped.
    while gloss_text.startswith(","):
        gloss_text = gloss_text.removeprefix(",").strip()
    leading_paren = re.match(r"\(([^()]+)\)", gloss_text)
    if leading_paren is not None:
        remainder = gloss_text[leading_paren.end() :].strip()
        if remainder == "":
            # gloss text after form-of template
            gloss_text = leading_paren.group(1)
        else:
            # expanded "verouderd" template in "2ps" template
            gloss_text = remainder
            sense.raw_tags.append(leading_paren.group(1))

    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
    if not (sense.glosses or sense.tags or sense.raw_tags or sense.examples):
        return
    translate_raw_tags(sense)
    if len(sense.glosses) == 0:
        sense.tags.append("no-gloss")
    if create_new_sense:
        word_entry.senses.append(sense)
def extract_pos_header_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Read entry-level data from the nodes that precede the gloss list.

    While ``word_entry.etymology_index`` is still empty, text between square
    brackets in a string node is stored as the etymology index.  A ``-l-``
    template is passed to ``extract_l_template``; the cleaned text of a
    ``dimt`` template is added as a raw tag and translated.
    """
    for node in nodes:
        if isinstance(node, str) and word_entry.etymology_index == "":
            bracketed = re.search(r"\[(.+)\]", node.strip())
            if bracketed is not None:
                word_entry.etymology_index = bracketed.group(1).strip()
            continue
        if not isinstance(node, TemplateNode):
            continue
        if node.template_name == "-l-":
            extract_l_template(wxr, word_entry, node)
        elif node.template_name == "dimt":
            word_entry.raw_tags.append(clean_node(wxr, word_entry, node))
            translate_raw_tags(word_entry)
def extract_l_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Add the tags named by the first argument of a ``-l-`` template.

    The cleaned first positional argument is looked up in a small table of
    gender/number codes and the matching tag(s) are added to
    ``word_entry.tags``.  Codes that are not in the table add nothing.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-l-
    first_arg = clean_node(wxr, None, node.template_parameters.get(1, ""))
    gender_args = {
        "n": "neuter",
        "m": "masculine",
        # "f" was missing although "fm" is handled here and the form-of
        # gender table in this module maps "f" to "feminine".
        # NOTE(review): confirm against the template documentation.
        "f": "feminine",
        "fm": ["feminine", "masculine"],
        "p": "plural",
    }
    tag = gender_args.get(first_arg)
    if isinstance(tag, str):
        word_entry.tags.append(tag)
    elif isinstance(tag, list):
        word_entry.tags.extend(tag)
240# https://nl.wiktionary.org/wiki/Sjabloon:noun-pl
241# https://nl.wiktionary.org/wiki/Sjabloon:noun-form
242# https://nl.wiktionary.org/wiki/Sjabloon:oudeschrijfwijze
243# "getal" and "gesl" args
244NOUN_FORM_OF_TEMPLATE_NUM_TAGS = {
245 "s": "singular",
246 "p": "plural",
247 "d": "dual",
248 "c": "collective",
249 "a": "animate",
250 "i": "inanimate",
251}
252NOUN_FORM_OF_TEMPLATE_GENDER_TAGS = {
253 "m": "masculine",
254 "f": "feminine",
255 "n": "neuter",
256 "c": "common",
257 "fm": ["feminine", "masculine"],
258 "mf": ["feminine", "masculine"],
259 "mn": ["masculine", "neuter"],
260}
def extract_oudeschrijfwijze_template_g_arg(
    wxr: WiktextractContext, g_arg: str, sense: Sense
) -> bool:
    """Add the tag(s) for one ``g`` code to ``sense``.

    The gender table is consulted before the number table and only the first
    table that contains ``g_arg`` is used.  Returns ``True`` when a table
    contained the code, ``False`` otherwise.  ``wxr`` is not used here.
    """
    for lookup in (
        NOUN_FORM_OF_TEMPLATE_GENDER_TAGS,
        NOUN_FORM_OF_TEMPLATE_NUM_TAGS,
    ):
        found = lookup.get(g_arg)
        if found is None:
            continue
        if isinstance(found, str):
            sense.tags.append(found)
        elif isinstance(found, list):
            sense.tags.extend(found)
        return True
    return False
def extract_oudeschrijfwijze_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Tag ``sense`` from the ``g`` argument of an "oudeschrijfwijze" template.

    The cleaned text of the argument is tried as a code first.  If that is
    not a known code and the raw argument is a list, the name of every
    template node in that list is tried as a code instead.
    """
    raw_g = t_node.template_parameters.get("g", "")
    g_text = clean_node(wxr, None, raw_g)
    if extract_oudeschrijfwijze_template_g_arg(wxr, g_text, sense):
        return
    if not isinstance(raw_g, list):
        return
    for part in raw_g:
        if isinstance(part, TemplateNode):
            extract_oudeschrijfwijze_template_g_arg(
                wxr, part.template_name, sense
            )
def extract_noun_form_of_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Append a "form-of" sense built from a form-of template.

    Number comes from a ``-pl`` template-name suffix or else the "getal"
    argument, gender from the "gesl" argument, and the base word from the
    first positional argument.  The gloss is the first list item found in the
    expanded template.
    """
    # https://nl.wiktionary.org/wiki/Categorie:Vormsjablonen
    params = t_node.template_parameters
    sense = Sense(tags=["form-of"])

    if not t_node.template_name.endswith("-pl"):
        number_code = clean_node(wxr, None, params.get("getal", ""))
        if number_code in NOUN_FORM_OF_TEMPLATE_NUM_TAGS:
            sense.tags.append(NOUN_FORM_OF_TEMPLATE_NUM_TAGS[number_code])
    else:
        sense.tags.append("plural")

    gender_code = clean_node(wxr, None, params.get("gesl", ""))
    if gender_code in NOUN_FORM_OF_TEMPLATE_GENDER_TAGS:
        gender = NOUN_FORM_OF_TEMPLATE_GENDER_TAGS[gender_code]
        if isinstance(gender, str):
            sense.tags.append(gender)
        elif isinstance(gender, list):
            sense.tags.extend(gender)

    # Sjabloon:oudeschrijfwijze
    if t_node.template_name == "oudeschrijfwijze":
        extract_oudeschrijfwijze_template(wxr, t_node, sense)

    base_word = clean_node(wxr, None, params.get(1, ""))
    if base_word != "":
        sense.form_of.append(AltForm(word=base_word))

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Only the first list item of the expansion is used as gloss.
    first_item = next(
        iter(expanded_node.find_child_recursively(NodeKind.LIST_ITEM)), None
    )
    if first_item is not None:
        sense.glosses.append(clean_node(wxr, None, first_item.children))
    # Return value unused: this call is made for what clean_node records on
    # `sense` while cleaning the whole expansion.
    clean_node(wxr, sense, expanded_node)
    word_entry.senses.append(sense)
def extract_verb_form_of_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Expand a verb form-of template and extract senses from the result.

    The expanded template is processed with ``extract_pos_section_nodes``.
    Afterwards the entry that was last before the call, and every entry
    appended during it, is tagged "form-of" (entry and each of its senses),
    gets the template's first argument as ``form_of`` word on each sense when
    that argument is not empty, and has section categories extracted from the
    expansion.
    """
    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen_voor_het_Nederlands
    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen
    from .page import extract_section_categories

    entries_before = len(page_data)
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    extract_pos_section_nodes(
        wxr, page_data, base_data, forms_data, expanded_node
    )
    form_of = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    has_form_of = form_of != ""
    # Negative start index: -1 when nothing was appended (last entry only),
    # one step further back for each entry appended by the call above.
    first_affected = entries_before - len(page_data) - 1
    for word_entry in page_data[first_affected:]:
        for sense in word_entry.senses:
            sense.tags.append("form-of")
            if has_form_of:
                sense.form_of.append(AltForm(word=form_of))
        extract_section_categories(wxr, word_entry, expanded_node)
        word_entry.tags.append("form-of")