Coverage for src / wiktextract / extractor / de / inflection.py: 87%
227 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-19 11:25 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-19 11:25 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .flexion import parse_flexion_page
9from .models import Form, WordEntry
10from .tags import translate_raw_tags
12# Kategorie:Wiktionary:Flexionstabelle (Deutsch)
def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection-table template to the matching table parser.

    The parser is chosen from ``t_node.template_name`` alone.  A template
    whose name matches none of the patterns below is silently ignored.
    """
    name = t_node.template_name

    # Templates handled by the noun-style table parser.
    noun_like_suffixes = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Vorname Übersicht f",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )
    is_noun_like = (
        "Substantiv Übersicht" in name
        or name.endswith(noun_like_suffixes)
        or re.search(r" Personalpronomen \d$", name) is not None
    )
    if is_noun_like:
        extract_noun_table_template(wxr, word_entry, t_node)
        return

    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return

    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return

    if name == "Deutsch Possessivpronomen":
        extract_de_pronoun_table(wxr, word_entry, t_node)
@dataclass
class RowspanHeader:
    """A table header label together with its position and extent.

    Despite the name, the table parsers in this module use this record both
    for row headers (``index`` 0, ``span`` = remaining rowspan) and for
    column headers (``index`` = first column, ``span`` = colspan).
    """

    # Cleaned text of the header cell.
    text: str
    # Position of the first row/column the header applies to.
    index: int
    # Number of rows/columns the header covers.
    span: int
def _verb_table_rowspan(table_cell: WikiNode) -> int:
    """Return the cell's ``rowspan`` attribute as an int, or 1 if unparsable."""
    try:
        return int(table_cell.attrs.get("rowspan", "1"))
    except ValueError:
        # Malformed attribute in the wikitext: treat as a single-row cell
        # instead of aborting the whole table.
        return 1


def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract verb forms from the first table of an expanded template.

    The template is expanded and parsed; each data cell of its first table
    is split on newlines and commas, and every resulting form is appended
    to ``word_entry.forms`` tagged with the matching column header and all
    currently active row headers.  Links to ``Flexion:`` pages are handed to
    ``parse_flexion_page``.
    """
    # Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    col_headers = []
    has_person = False
    # Row headers that are still in effect; carried across rows while their
    # remaining span is larger than one.
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        header_col_index = 0
        pronouns = []
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                # Footer cell linking to the full inflection page(s).
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif header_col_index == 0:
                    # First non-empty header cell of the row is a row header.
                    row_headers.append(
                        RowspanHeader(
                            cell_text, 0, _verb_table_rowspan(table_cell)
                        )
                    )
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    pronouns.clear()
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if (
                    "background-color: #f4f4f4"
                    in table_cell.attrs.get("style", "").lower()
                ):
                    # Template:Englisch Verb Übersicht
                    # A shaded data cell acts as a row header there.
                    row_headers.append(
                        RowspanHeader(
                            cell_text, 0, _verb_table_rowspan(table_cell)
                        )
                    )
                    continue
                elif has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        pronouns = list(
                            filter(None, map(str.strip, cell_text.split(",")))
                        )
                else:
                    for cell_line in cell_text.splitlines():
                        for form_str in map(str.strip, cell_line.split(",")):
                            if form_str in ["", "—"]:
                                continue
                            elif form_str.startswith("Flexion:"):
                                parse_flexion_page(wxr, word_entry, form_str)
                                continue
                            form = Form(form=form_str, pronouns=pronouns)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # Age the row headers: drop those that ended with this row and
        # shorten the remaining span of the others.
        new_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                new_row_headers.append(row_header)
        row_headers = new_row_headers
def extract_noun_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Expand a noun-style overview template and process what it contains.

    Every table directly under the expanded template is passed to
    ``process_noun_table``; every level-4 section whose title starts with
    "Anmerkung" is passed to ``extract_note_section``.
    """
    # Vorlage:Deutsch Substantiv Übersicht
    # Imported inside the function rather than at module level
    # (presumably to avoid a circular import with .page — TODO confirm).
    from .page import extract_note_section

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    # Return value is unused; the call is made with ``word_entry`` for its
    # effect on the entry (presumably category collection — TODO confirm).
    clean_node(wxr, word_entry, root)

    for table_node in root.find_child(NodeKind.TABLE):
        process_noun_table(wxr, word_entry, table_node, t_node.template_name)

    # Vorlage:Deutsch Nachname Übersicht
    for heading in root.find_child(NodeKind.LEVEL4):
        title = clean_node(wxr, None, heading.largs)
        if not title.startswith("Anmerkung"):
            continue
        extract_note_section(wxr, word_entry, heading)
def _noun_header_colspan(table_cell: WikiNode) -> int:
    """Return the cell's ``colspan`` attribute as an int, or 1 if unparsable."""
    try:
        return int(table_cell.attrs.get("colspan", "1"))
    except ValueError:
        # Malformed attribute in the wikitext: treat as a single column
        # instead of aborting the whole table.
        return 1


def process_noun_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    template_name: str,
):
    """Extract forms from one noun-style inflection table.

    Each line of each data cell becomes a ``Form`` tagged with the table
    header (if any), the row header and every column header whose span
    covers the cell's column.  For the templates listed near the end of the
    function, a leading article is split off via ``separate_de_article``.
    The forms are appended to ``word_entry.forms``; ``Flexion:`` links found
    in header-less rows are parsed afterwards with ``parse_flexion_page``.
    """
    column_headers = []
    table_header = ""
    forms = []
    flexion_pages = []
    for table_row in table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ["", "Kasus", "Utrum", "m", "f", "m, f"]
                    and col_index == 0
                ):
                    # Corner / label cell in front of the real headers.
                    continue
                elif is_header_row:
                    colspan = _noun_header_colspan(table_cell)
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                # drop a trailing number such as "Plural 1"
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                # NOTE(review): an empty data cell is skipped without
                # advancing col_index, so later cells in the row are matched
                # against headers one column to the left — confirm intended.
                continue
            elif not row_has_header:
                # Vorlage:Deutsch adjektivisch Übersicht
                # A data cell in a row without header cells starts a new
                # sub-table and serves as its title.
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        flexion_pages.append(link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ["—", "–", "-", "", "?"]:
                        continue
                    form = Form(form=form_text)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if len(row_header) > 0:
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if (
                            col_header.text not in ("", "—")
                            and col_index >= col_header.index
                            and col_index < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    forms.append(form)
                col_index += 1

    if template_name in (
        "Deutsch Substantiv Übersicht",
        "Deutsch Vorname Übersicht m",
        "Deutsch Vorname Übersicht f",
    ):
        forms = separate_de_article(wxr, forms)
    word_entry.forms.extend(forms)
    for flexion_page in flexion_pages:
        parse_flexion_page(wxr, word_entry, flexion_page)
def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the first table of an adjective/adverb template.

    Header cells are collected in order as column headers.  Each line of a
    data cell becomes a ``Form`` tagged with the header at the cell's index
    and is appended to ``word_entry.forms``.  Links in the
    "All other forms:" cell are passed to ``parse_flexion_page``.
    """
    # Vorlage:Deutsch Adjektiv Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    column_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        for col_index, table_cell in enumerate(
            table_row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
        ):
            cell_text = clean_node(wxr, None, table_cell)
            # because {{int:}} magic word is not implemented
            # template "Textbaustein-Intl" expands to English words
            if cell_text.startswith("All other forms:"):
                for link_node in table_cell.find_child(NodeKind.LINK):
                    parse_flexion_page(
                        wxr, word_entry, clean_node(wxr, None, link_node)
                    )
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                column_headers.append(cell_text)
            else:
                for form_text in cell_text.splitlines():
                    # Strip first so padded lines neither slip past the
                    # placeholder filter nor end up as forms with stray
                    # whitespace (same handling as process_noun_table).
                    form_text = form_text.strip()
                    if form_text in ("—", "", "?"):
                        continue
                    form = Form(form=form_text)
                    if col_index < len(column_headers):
                        form.raw_tags.append(column_headers[col_index])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
def extract_de_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the first table of a possessive-pronoun template.

    Data cells alternate: a cell at an even column index supplies the
    article, the following odd one the form.  Each form is tagged with the
    current sub-table title, the row header and every column header whose
    span covers its column (except "Wortform"), then appended to
    ``word_entry.forms``.
    """
    # Vorlage:Deutsch Possessivpronomen
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    first_table = next(iter(root.find_child(NodeKind.TABLE)), None)
    if first_table is None:
        return

    span_headers = []
    caption = ""
    for row in first_table.find_child(NodeKind.TABLE_ROW):
        row_label = ""
        has_data_cells = row.contain_node(NodeKind.TABLE_CELL)
        column = 0
        pending_article = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            text = clean_node(wxr, None, cell)
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if text == "":
                    continue
                if has_data_cells:
                    # header cell inside a data row labels that row
                    row_label = text
                elif len(list(row.find_child(NodeKind.TABLE_HEADER_CELL))) == 1:
                    # a lone header cell is the title of a new table
                    caption = text
                    span_headers.clear()
                    pending_article = ""
                else:
                    raw_span = cell.attrs.get("colspan", "1")
                    # fall back to one column when colspan is not numeric
                    width = (
                        int(raw_span) if re.fullmatch(r"\d+", raw_span) else 1
                    )
                    if text != "—":
                        span_headers.append(
                            RowspanHeader(text, column, width)
                        )
                    column += width
            elif cell.kind == NodeKind.TABLE_CELL:
                if column % 2 == 0:
                    if text != "—":
                        pending_article = text
                else:
                    form = Form(form=text, article=pending_article)
                    if caption != "":
                        form.raw_tags.append(caption)
                    if row_label != "":
                        form.raw_tags.append(row_label)
                    form.raw_tags.extend(
                        header.text
                        for header in span_headers
                        if header.index <= column < header.index + header.span
                        and header.text != "Wortform"
                    )
                    translate_raw_tags(form)
                    if form.form not in ["", "—"]:
                        word_entry.forms.append(form)
                    pending_article = ""
                column += 1
def separate_de_article(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Split a leading German definite article off each form.

    A form starting with der/die/das/den/dem/des (optionally wrapped in
    parentheses) followed by whitespace has that article moved into
    ``form.article``.  Forms that are empty or "—" afterwards are dropped.
    The ``Form`` objects are modified in place; ``wxr`` is not used.
    """
    # https://en.wikipedia.org/wiki/German_articles
    # https://de.wiktionary.org/wiki/Vorlage:Deutsch_Substantiv_Übersicht
    # https://de.wiktionary.org/wiki/Vorlage:Deutsch_Vorname_Übersicht_m
    # * May contain parens around the article
    article_prefix = re.compile(r"\(?(der|die|das|den|dem|des)\)?\s+")
    kept = []
    for entry in forms:
        found = article_prefix.match(entry.form)
        if found is not None:
            entry.form = entry.form[found.end() :]
            entry.article = found.group(1)
        if entry.form in ["", "—"]:
            continue
        kept.append(entry)
    return kept