Coverage for src/wiktextract/extractor/nl/inflection.py: 96%
126 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
def extract_inflection_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection template to the extractor that handles it.

    ``-nlnoun-`` and ``adjcomp`` go to `extract_noun_adj_table`,
    ``-nlstam-`` goes to `extract_nlstam_template`. Any other template
    name is ignored.
    """
    name = t_node.template_name
    if name == "-nlstam-":
        extract_nlstam_template(wxr, word_entry, t_node)
    elif name in ("-nlnoun-", "adjcomp"):
        extract_noun_adj_table(wxr, word_entry, t_node)
def extract_noun_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from the table produced by a noun/adjective template.

    The template is expanded and every table row is scanned: non-empty
    header cells are recorded as column headers (the list keeps growing
    across rows and tables), the first data cell of a row is its row
    header, and each line of every further data cell becomes a `Form`
    appended to ``word_entry.forms``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlnoun-
    # https://nl.wiktionary.org/wiki/Sjabloon:adjcomp
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)
    column_headers = []
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for header_cell in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL
            ):
                header_text = clean_node(wxr, None, header_cell)
                if header_text != "":
                    column_headers.append(header_text)

            row_header = ""
            data_cells = row_node.find_child(NodeKind.TABLE_CELL)
            for col_index, data_node in enumerate(data_cells):
                cell_text = clean_node(wxr, None, data_node)
                if col_index == 0:
                    # The first data cell labels the row.
                    row_header = cell_text
                    continue
                # Data cell N pairs with recorded column header N - 1.
                header_pos = col_index - 1
                for form_str in cell_text.splitlines():
                    # Skip empty lines, "-" placeholders and the headword.
                    if form_str in ("", "-", wxr.wtp.title):
                        continue
                    form = Form(form=form_str)
                    if row_header not in ("", "naamwoord"):
                        form.raw_tags.append(row_header)
                    if header_pos < len(column_headers):
                        form.raw_tags.append(column_headers[header_pos])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

    # Top-level links of the expansion are cleaned against the entry;
    # presumably this lets clean_node record link data such as
    # categories -- confirm against clean_node.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)
def extract_nlstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the principal verb forms from a ``-nlstam-`` template.

    Parameter 2 yields a form tagged ``past`` and parameter 3 a form
    tagged ``past``/``participle``. The IPA of each form is read from
    the parameter three positions later (5 and 6). Empty parameters are
    skipped. Afterwards the template itself is cleaned against the entry
    and the ``/vervoeging`` subpage is processed.
    """
    # verb table
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlstam-
    params = t_node.template_parameters
    tags_by_position = ((2, ["past"]), (3, ["past", "participle"]))
    for position, tags in tags_by_position:
        form_str = clean_node(wxr, None, params.get(position, ""))
        if form_str == "":
            continue
        ipa_str = clean_node(wxr, None, params.get(position + 3, ""))
        form = Form(form=form_str, ipa=ipa_str)
        form.tags.extend(tags)
        word_entry.forms.append(form)

    clean_node(wxr, word_entry, t_node)
    extract_vervoeging_page(wxr, word_entry)
def extract_vervoeging_page(
    wxr: WiktextractContext, word_entry: WordEntry
) -> None:
    """Extract conjugation forms from the ``<title>/vervoeging`` subpage.

    The subpage of the page currently being processed is looked up in
    namespace 0. Every top-level ``-nlverb-`` template found on it is
    handed to `extract_nlverb_template`. Nothing happens when the
    subpage does not exist or carries no text.
    """
    page = wxr.wtp.get_page(f"{wxr.wtp.title}/vervoeging", 0)
    # A page record may exist without text (the body is optional in
    # wikitextprocessor, presumably for redirects); parsing ``None``
    # would fail, so treat it like a missing page.
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    for t_node in root.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "-nlverb-":
            extract_nlverb_template(wxr, word_entry, t_node)
@dataclass
class TableHeader:
    """Text, position and extent of a single table header cell."""

    # Header text.
    text: str
    # Column at which the header cell starts.
    col_index: int
    # Number of columns the header cell covers.
    colspan: int
    # Row at which the header cell starts.
    row_index: int
    # Number of rows the header cell covers.
    rowspan: int
# Maps the leading text of a header cell to the tags it implies.
# Lookups use `str.startswith`, so a key only needs to match the
# beginning of the header text.
NLVERB_HEADER_PREFIXES = {
    "vervoeging van de bedrijvende vorm van": ["active"],
    "onpersoonlijke lijdende vorm": ["impersonal", "passive"],
    "lijdende vorm": ["passive"],
}
def _nlverb_cell_span(cell_node: WikiNode, attr_name: str) -> int:
    """Return a cell's ``colspan``/``rowspan`` attribute as an int.

    A missing attribute, or a value that is not a plain digit string,
    yields 1.
    """
    span_str = cell_node.attrs.get(attr_name, "1")
    if re.fullmatch(r"\d+", span_str):
        return int(span_str)
    return 1


def _nlverb_header_covers_row(header: TableHeader, row_index: int) -> bool:
    """Tell whether `header` reaches table row `row_index` via its rowspan."""
    return header.row_index <= row_index < header.row_index + header.rowspan


def _nlverb_header_covers_col(header: TableHeader, col_index: int) -> bool:
    """Tell whether `header` reaches column `col_index` via its colspan."""
    return header.col_index <= col_index < header.col_index + header.colspan


def extract_nlverb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract conjugated forms from an expanded ``-nlverb-`` template.

    Every table of the expansion is walked row by row. Header cells feed
    three kinds of state: tags derived from `NLVERB_HEADER_PREFIXES`,
    shared raw tags (the first cell of an all-header row), and positioned
    column/row headers. Each non-header cell whose text is neither empty
    nor the page title becomes a `Form` carrying the shared tags plus
    the text of every row/column header that covers its position. The
    form is appended to ``word_entry.forms`` with ``<title>/vervoeging``
    as its source.

    A new run of all-header rows starts a new table section: the row
    counter and all collected header state are reset.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlverb-
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        row_index = 0
        shared_tags = []
        shared_raw_tags = []
        last_row_all_header = False
        col_headers = []
        row_headers = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            # Start past the columns taken by row headers that reach
            # into this row from earlier rows.
            # NOTE(review): the offset is advanced by the header's
            # rowspan, not its colspan -- kept as in the original,
            # please confirm this is intended.
            col_index = 0
            for row_header in row_headers:
                if _nlverb_header_covers_row(row_header, row_index):
                    col_index += row_header.rowspan

            # Collect the cells once; they are needed both for the
            # all-header test and for the main pass below.
            cells = list(row_node.find_child(cell_kinds))
            current_row_all_header = all(
                nlverb_table_cell_is_header(cell) for cell in cells
            )
            if current_row_all_header and not last_row_all_header:
                # First row of a new header run: begin a new section.
                row_index = 0
                shared_tags.clear()
                shared_raw_tags.clear()
                col_headers.clear()
                row_headers.clear()

            is_row_first_node = True
            for cell_node in cells:
                cell_colspan = _nlverb_cell_span(cell_node, "colspan")
                cell_rowspan = _nlverb_cell_span(cell_node, "rowspan")
                cell_str = clean_node(wxr, None, cell_node)
                if cell_str in ["", wxr.wtp.title]:
                    # Nothing to record, but the cell still takes space.
                    col_index += cell_colspan
                    is_row_first_node = False
                    continue

                if nlverb_table_cell_is_header(cell_node):
                    for (
                        header_prefix,
                        prefix_tags,
                    ) in NLVERB_HEADER_PREFIXES.items():
                        if cell_str.startswith(header_prefix):
                            shared_tags.extend(prefix_tags)
                            break
                    else:
                        # No known prefix matched: keep the header text.
                        if current_row_all_header:
                            if is_row_first_node:
                                shared_raw_tags.append(cell_str)
                            else:
                                col_headers.append(
                                    TableHeader(
                                        cell_str,
                                        col_index,
                                        cell_colspan,
                                        row_index,
                                        cell_rowspan,
                                    )
                                )
                        else:
                            # Drop a parenthesised remainder from row
                            # header text.
                            if "(" in cell_str:
                                cell_str = cell_str[
                                    : cell_str.index("(")
                                ].strip()
                            row_headers.append(
                                TableHeader(
                                    cell_str,
                                    col_index,
                                    cell_colspan,
                                    row_index,
                                    cell_rowspan,
                                )
                            )
                else:
                    form = Form(
                        form=cell_str,
                        # Pass copies: the shared lists are extended and
                        # cleared for later cells, and this form's own
                        # raw tags are appended to below.
                        tags=list(shared_tags),
                        raw_tags=list(shared_raw_tags),
                        source=f"{wxr.wtp.title}/vervoeging",
                    )
                    form.raw_tags.extend(
                        header.text
                        for header in row_headers
                        if _nlverb_header_covers_row(header, row_index)
                    )
                    form.raw_tags.extend(
                        header.text
                        for header in col_headers
                        if _nlverb_header_covers_col(header, col_index)
                    )
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

                col_index += cell_colspan
                is_row_first_node = False

            row_index += 1
            last_row_all_header = current_row_all_header
def nlverb_table_cell_is_header(node: WikiNode) -> bool:
    """Tell whether a table cell acts as a header.

    True for a real header cell, and for any cell whose ``class``
    attribute is exactly ``infoboxrijhoofding``.
    """
    if node.kind == NodeKind.TABLE_HEADER_CELL:
        return True
    return node.attrs.get("class", "") == "infoboxrijhoofding"