Coverage for src/wiktextract/extractor/nl/pos.py: 90%
130 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .example import (
8 EXAMPLE_TEMPLATES,
9 extract_example_list_item,
10 extract_example_template,
11)
12from .models import AltForm, Sense, WordEntry
13from .section_titles import POS_DATA
14from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Start a new word entry for a part-of-speech section and fill it.

    A deep copy of ``base_data`` is appended to ``page_data`` and given the
    POS and tags that ``POS_DATA`` lists for ``pos_title``.  Forms and
    categories held in ``forms_data`` are copied onto the new entry when
    their POS matches (an "unknown" POS adopts the new entry's POS first);
    otherwise they are cleared.  The section's child nodes are then handed
    to ``extract_pos_section_nodes``.

    Raises:
        KeyError: if ``pos_title`` is not a key of ``POS_DATA``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title

    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Forms collected before any POS was known belong to this first section.
    if forms_data.pos == "unknown":
        forms_data.pos = entry.pos

    if forms_data.pos != entry.pos:
        # Collected forms were for a different POS: drop them.
        forms_data.forms.clear()
        forms_data.categories.clear()
    else:
        entry.forms.extend(forms_data.forms)
        entry.categories.extend(forms_data.categories)

    extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node)
def extract_pos_section_nodes(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch the children of a POS section to the matching extractor.

    Handled children, in order of precedence:

    * ``#`` lists: each list item becomes a sense of the latest entry.  The
      nodes preceding the *first* such list are treated as the header line
      and passed once to ``extract_pos_header_line_nodes``.
    * nested level nodes: parsed as a section when their title is a key of
      ``POS_DATA``; any other title stops the walk.
    * example templates: attached to the latest sense, if there is one.
    * "noun-pl"/"noun-form" and verb form-of templates.

    ``page_data[-1]`` is looked up afresh for every child because nested
    sections may append new entries to ``page_data``.
    """
    verb_form_prefixes = (
        "1ps",
        "2ps",
        "aanv-w",
        "onv-d",
        "ott-",
        "ovt-",
        "tps",
        "volt-d",
        "eng-onv-d",
    )
    # A boolean rather than an index sentinel: the first gloss list may
    # legitimately sit at index 0, which an "index == 0 means unseen" check
    # would mistake for "not found yet" and re-run header extraction on
    # every later list.
    header_extracted = False
    for index, node in enumerate(level_node.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.endswith("#")
        ):
            if not header_extracted:
                header_extracted = True
                extract_pos_header_line_nodes(
                    wxr, page_data[-1], level_node.children[:index]
                )
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
        elif isinstance(node, LevelNode):
            title_text = clean_node(wxr, None, node.largs)
            if title_text in POS_DATA:
                # expanded from "eng-onv-d" form-of template
                from .page import parse_section

                parse_section(wxr, page_data, base_data, forms_data, node)
            else:
                break
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in EXAMPLE_TEMPLATES
            and len(page_data[-1].senses) > 0
        ):
            extract_example_template(wxr, page_data[-1].senses[-1], node)
        elif isinstance(node, TemplateNode) and node.template_name in [
            "noun-pl",
            "noun-form",
        ]:
            extract_noun_form_of_template(wxr, page_data[-1], node)
        elif isinstance(node, TemplateNode) and node.template_name.startswith(
            verb_form_prefixes
        ):
            extract_verb_form_of_template(
                wxr, page_data, base_data, forms_data, node
            )
# Templates inside a gloss list item whose expanded text is stored as a raw
# tag on the sense instead of becoming part of the gloss text.
# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
GLOSS_TAG_TEMPLATES = frozenset({"auxl", "erga", "inerg"})
def extract_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Build a sense from one gloss list item and append it to ``word_entry``.

    Templates named in ``GLOSS_TAG_TEMPLATES`` and any template or italic
    text that expands to a fully parenthesised string become raw tags.
    Nested ``*`` lists are passed to ``extract_example_list_item``.  All
    remaining children form the gloss text.  The sense is only kept when the
    resulting gloss text is non-empty.
    """
    sense = Sense()
    gloss_parts = []

    def add_text_or_qualifier(text: str) -> None:
        # "(…)" wrapped text is a qualifier; anything else is gloss text.
        if text.startswith("(") and text.endswith(")"):
            sense.raw_tags.append(text.strip("() "))
        else:
            gloss_parts.append(text)

    for child in list_item.children:
        if isinstance(child, TemplateNode):
            expanded = clean_node(wxr, sense, child)
            if child.template_name in GLOSS_TAG_TEMPLATES:
                sense.raw_tags.append(expanded)
            else:
                add_text_or_qualifier(expanded)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            if child.sarg.endswith("*"):
                for example_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, example_item)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            add_text_or_qualifier(clean_node(wxr, sense, child))
        else:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text.startswith(","):  # between qualifier templates
        gloss_text = gloss_text.removeprefix(",").strip()
    leading_paren = re.match(r"\(([^()]+)\)", gloss_text)
    if leading_paren is not None:
        # expanded "verouderd" template in "2ps" template
        gloss_text = gloss_text[leading_paren.end() :].strip()
        sense.raw_tags.append(leading_paren.group(1))

    if not gloss_text:
        return
    sense.glosses.append(gloss_text)
    translate_raw_tags(sense)
    word_entry.senses.append(sense)
def extract_pos_header_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Read the header line nodes that precede the first gloss list.

    The first string node containing ``[...]`` supplies
    ``word_entry.etymology_index`` (only while that field is still empty).
    Every "-l-" template is forwarded to ``extract_l_template``.
    """
    for node in nodes:
        if isinstance(node, str):
            if word_entry.etymology_index != "":
                continue  # index already found, ignore further text
            bracketed = re.search(r"\[(.+)\]", node.strip())
            if bracketed is not None:
                word_entry.etymology_index = bracketed.group(1).strip()
            continue
        if isinstance(node, TemplateNode) and node.template_name == "-l-":
            extract_l_template(wxr, word_entry, node)
def extract_l_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Add the tags named by the first argument of a "-l-" template.

    Unrecognised or missing arguments add nothing.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-l-
    # NOTE(review): there is no entry for a lone "f" code here, unlike
    # NOUN_FORM_OF_TEMPLATE_GENDER_TAGS - confirm against the template page.
    tags_by_arg = {
        "n": ["neuter"],
        "m": ["masculine"],
        "fm": ["feminine", "masculine"],
        "p": ["plural"],
    }
    gender_code = clean_node(wxr, None, node.template_parameters.get(1, ""))
    word_entry.tags.extend(tags_by_arg.get(gender_code, ()))
178# https://nl.wiktionary.org/wiki/Sjabloon:noun-pl
179# https://nl.wiktionary.org/wiki/Sjabloon:noun-form
180# "getal" and "gesl" args
181NOUN_FORM_OF_TEMPLATE_NUM_TAGS = {
182 "s": "singular",
183 "p": "plural",
184 "d": "dual",
185 "c": "collective",
186}
187NOUN_FORM_OF_TEMPLATE_GENDER_TAGS = {
188 "m": "masculine",
189 "f": "feminine",
190 "n": "neuter",
191 "c": "common",
192 "fm": ["feminine", "masculine"],
193 "mf": ["feminine", "masculine"],
194 "mn": ["masculine", "neuter"],
195}
def extract_noun_form_of_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Create a "form-of" sense from a "noun-pl" or "noun-form" template.

    "noun-pl" always yields the "plural" tag; for other templates the number
    tag comes from the "getal" argument.  The "gesl" argument supplies
    gender tags, the first positional argument the lemma, and the first list
    item of the expanded template the gloss text.  The new sense is appended
    to ``word_entry.senses``.
    """
    params = t_node.template_parameters
    sense = Sense(tags=["form-of"])
    if t_node.template_name == "noun-pl":
        sense.tags.append("plural")
    else:
        # Arguments go through clean_node, like the lemma below, so the
        # lookup key is always a plain string: a raw parameter value may not
        # be hashable and would make the dict membership test raise.
        num_arg = clean_node(wxr, None, params.get("getal", ""))
        if num_arg in NOUN_FORM_OF_TEMPLATE_NUM_TAGS:
            sense.tags.append(NOUN_FORM_OF_TEMPLATE_NUM_TAGS[num_arg])

    gender_arg = clean_node(wxr, None, params.get("gesl", ""))
    if gender_arg in NOUN_FORM_OF_TEMPLATE_GENDER_TAGS:
        gender_tag = NOUN_FORM_OF_TEMPLATE_GENDER_TAGS[gender_arg]
        if isinstance(gender_tag, str):
            sense.tags.append(gender_tag)
        elif isinstance(gender_tag, list):
            sense.tags.extend(gender_tag)

    form_of = clean_node(wxr, None, params.get(1, ""))
    if form_of != "":
        sense.form_of.append(AltForm(word=form_of))

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Only the first list item of the expansion is used as the gloss.
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        sense.glosses.append(clean_node(wxr, None, list_item.children))
        break
    # Result discarded on purpose: this pass exists for whatever clean_node
    # records on `sense` (presumably categories - TODO confirm).
    clean_node(wxr, sense, expanded_node)
    word_entry.senses.append(sense)
def extract_verb_form_of_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Expand a verb form-of template and mark the resulting senses.

    The template is expanded and parsed, and the result is processed like a
    POS section (which may add senses to the current entry and append new
    entries to ``page_data``).  Afterwards the entry that was last when this
    function was called, plus every entry appended since, is tagged
    "form-of"; their senses are tagged likewise and linked to the lemma given
    as the template's first argument.

    Tagging is idempotent: a section holding several form-of templates
    triggers one call per template, and each call revisits senses that an
    earlier call already tagged.
    """
    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen_voor_het_Nederlands
    from .page import extract_section_categories

    orig_data_len = len(page_data)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    extract_pos_section_nodes(
        wxr, page_data, base_data, forms_data, expanded_node
    )
    form_of = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    # Start at the entry that was last before expansion; max() keeps the
    # start index non-negative if the list was empty on entry.
    for word_entry in page_data[max(orig_data_len - 1, 0) :]:
        for sense in word_entry.senses:
            if "form-of" not in sense.tags:
                sense.tags.append("form-of")
            if form_of != "" and all(
                linked.word != form_of for linked in sense.form_of
            ):
                sense.form_of.append(AltForm(word=form_of))
        extract_section_categories(wxr, word_entry, expanded_node)
        if "form-of" not in word_entry.tags:
            word_entry.tags.append("form-of")