Coverage for src / wiktextract / extractor / de / inflection.py: 87%
227 statements
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-29 01:50 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-29 01:50 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .flexion import parse_flexion_page
9from .models import Form, WordEntry
10from .tags import translate_raw_tags
12# Kategorie:Wiktionary:Flexionstabelle (Deutsch)
def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection-table template to the matching table extractor.

    The template name decides which extractor runs:

    * noun-like overview templates (substring, suffix or regex match below)
      go to ``extract_noun_table_template``;
    * adjective/adverb overview templates go to ``process_adj_table``;
    * verb overview templates and "Kardinalzahl 2-12" go to
      ``process_verb_table``;
    * "Deutsch Possessivpronomen" goes to ``extract_de_pronoun_table``.

    Templates matching none of the rules are ignored.
    """
    name = t_node.template_name

    # Suffixes of templates that are laid out like the noun overview table.
    noun_like_suffixes = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )
    if (
        "Substantiv Übersicht" in name
        or name.endswith(noun_like_suffixes)
        or re.search(r" Personalpronomen \d$", name)
    ):
        extract_noun_table_template(wxr, word_entry, t_node)
        return

    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return

    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return

    if name == "Deutsch Possessivpronomen":
        extract_de_pronoun_table(wxr, word_entry, t_node)
@dataclass
class RowspanHeader:
    """A table header together with the cell range it applies to.

    Despite the name, this file uses it both for row headers (``span`` holds
    the cell's ``rowspan``) and for column headers (``index`` holds the
    starting column and ``span`` the cell's ``colspan``).
    """

    # Cleaned header text; appended to ``Form.raw_tags`` of covered cells.
    text: str
    # Starting column of a column header; row headers are created with 0.
    index: int
    # Number of rows (row header) or columns (column header) covered.
    span: int
def _verb_table_rowspan(table_cell: WikiNode) -> int:
    """Return the ``rowspan`` attribute of a table cell as an integer.

    Falls back to 1 when the attribute is missing or is not a valid integer,
    so malformed table markup cannot abort extraction with ``ValueError``.
    """
    try:
        return int(table_cell.attrs.get("rowspan", "1"))
    except ValueError:
        return 1


def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract verb forms from an expanded verb overview template.

    The template (e.g. Vorlage:Deutsch Verb Übersicht) is expanded and only
    its first table is read.  Every form found is appended to
    ``word_entry.forms`` with raw tags taken from the column header at the
    cell's column index and from all row headers that are currently active.

    Args:
        wxr: Extraction context; used to expand the template and clean nodes.
        word_entry: Entry that receives the extracted forms.
        t_node: The overview template node to process.
    """
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    # Column headers and the "person column" flag persist across rows.
    col_headers = []
    has_person = False
    # Row headers still in effect; each one lives for ``span`` rows.
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        header_col_index = 0
        pronouns = []
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                # Footer cell linking to the full conjugation page.
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif header_col_index == 0:
                    # First non-empty header cell of the row is a row header.
                    rowspan = _verb_table_rowspan(table_cell)
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    pronouns.clear()
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if (
                    "background-color: #f4f4f4"
                    in table_cell.attrs.get("style", "").lower()
                ):
                    # Template:Englisch Verb Übersicht marks row headers with
                    # a grey data cell instead of a header cell.
                    rowspan = _verb_table_rowspan(table_cell)
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                    # Does not count as a data column.
                    continue
                elif has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        # Comma-separated pronouns for this row's forms.
                        pronouns = list(
                            filter(None, map(str.strip, cell_text.split(",")))
                        )
                else:
                    for cell_line in cell_text.splitlines():
                        for form_str in map(str.strip, cell_line.split(",")):
                            if form_str in ("", "—"):
                                continue
                            elif form_str.startswith("Flexion:"):
                                parse_flexion_page(wxr, word_entry, form_str)
                                continue
                            form = Form(form=form_str, pronouns=pronouns)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # Age the row headers: keep only those that still span further rows.
        new_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                new_row_headers.append(row_header)
        row_headers = new_row_headers
def extract_noun_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Expand a noun-style overview template and extract all of its tables.

    Every table in the expanded template is handed to ``process_noun_table``.
    Level-4 sections whose heading starts with "Anmerkung" are passed to
    ``extract_note_section``.
    """
    # Template: Vorlage:Deutsch Substantiv Übersicht
    from .page import extract_note_section

    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    # Return value is discarded; presumably called for its side effects on
    # word_entry -- confirm against clean_node.
    clean_node(wxr, word_entry, expanded)
    for table_node in expanded.find_child(NodeKind.TABLE):
        process_noun_table(wxr, word_entry, table_node, t_node.template_name)

    # Template: Vorlage:Deutsch Nachname Übersicht
    for heading in expanded.find_child(NodeKind.LEVEL4):
        heading_text = clean_node(wxr, None, heading.largs)
        if not heading_text.startswith("Anmerkung"):
            continue
        extract_note_section(wxr, word_entry, heading)
def _noun_table_colspan(table_cell: WikiNode) -> int:
    """Return the ``colspan`` attribute of a table cell as an integer.

    Falls back to 1 when the attribute is missing or is not a valid integer,
    so malformed table markup cannot abort extraction with ``ValueError``.
    """
    try:
        return int(table_cell.attrs.get("colspan", "1"))
    except ValueError:
        return 1


def process_noun_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    template_name: str,
) -> None:
    """Extract forms from one table of a noun-style overview template.

    Each form gets raw tags from the current table header (if any), the row
    header of its row, and every column header whose column range contains
    the cell.  For "Deutsch Substantiv Übersicht" and
    "Deutsch Vorname Übersicht m" the leading article is split off the form
    via ``separate_de_article``.  "Flexion:" links collected from table
    header cells are parsed after the forms have been added.

    Args:
        wxr: Extraction context used to clean nodes.
        word_entry: Entry that receives the extracted forms.
        table: The table node to read.
        template_name: Name of the template the table came from.
    """
    column_headers = []
    table_header = ""
    forms = []
    flexion_pages = []
    for table_row in table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        # A row without any data cell consists of column headers only.
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ["", "Kasus", "Utrum", "m", "f", "m, f"]
                    and col_index == 0
                ):
                    # Corner cell above the row headers: not a column.
                    continue
                elif is_header_row:
                    colspan = _noun_table_colspan(table_cell)
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                # Drop a trailing number such as "Plural 1".
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                continue
            elif not row_has_header:
                # Vorlage:Deutsch adjektivisch Übersicht: a data cell in a
                # row without header cells starts a new sub-table.
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        flexion_pages.append(link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ("—", "–", "-", "", "?"):
                        continue
                    form = Form(form=form_text)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if row_header:
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if (
                            col_header.text not in ("", "—")
                            and col_index >= col_header.index
                            and col_index < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    forms.append(form)
                col_index += 1

    if template_name in (
        "Deutsch Substantiv Übersicht",
        "Deutsch Vorname Übersicht m",
    ):
        forms = separate_de_article(wxr, forms)
    word_entry.forms.extend(forms)
    for flexion_page in flexion_pages:
        parse_flexion_page(wxr, word_entry, flexion_page)
def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from an adjective or adverb overview template.

    Only the first table of the expanded template is read.  Header cells are
    collected, in encounter order, as column headers; each line of a data
    cell becomes a form tagged with the column header at the cell's position
    in its row.  Links in the "All other forms:" cell are passed to
    ``parse_flexion_page``.
    """
    # Template: Vorlage:Deutsch Adjektiv Übersicht
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    tables = list(expanded.find_child(NodeKind.TABLE))
    if not tables:
        return

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    headers = []
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        for position, cell in enumerate(row.find_child(cell_kinds)):
            text = clean_node(wxr, None, cell)
            # The {{int:}} magic word is not implemented, so the template
            # "Textbaustein-Intl" expands to English words.
            if text.startswith("All other forms:"):
                for link in cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link)
                    parse_flexion_page(wxr, word_entry, link_text)
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                headers.append(text)
                continue
            for line in text.splitlines():
                if line not in ("—", "", "?"):
                    form = Form(form=line)
                    if position < len(headers):
                        form.raw_tags.append(headers[position])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
def extract_de_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the "Deutsch Possessivpronomen" template.

    Only the first table of the expanded template is read.  Data cells come
    in pairs: a cell at an even column index holds the article and the next
    (odd) cell holds the word form.  Each form is tagged with the current
    table header, the row header, and every column header whose column range
    contains the cell (except "Wortform").

    Args:
        wxr: Extraction context; used to expand the template and clean nodes.
        word_entry: Entry that receives the extracted forms.
        t_node: The template node to process.
    """
    # Template: Vorlage:Deutsch Possessivpronomen
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    # Column headers and the table header persist across rows until a new
    # table header row resets them.
    col_headers = []
    table_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        col_index = 0
        article = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    # Empty header cells do not advance col_index.
                    continue
                elif row_has_data:
                    # Header cell in a data row is that row's header.
                    row_header = cell_text
                elif len(list(row.find_child(NodeKind.TABLE_HEADER_CELL))) == 1:
                    # A header-only row with a single header cell titles a
                    # new sub-table.
                    table_header = cell_text
                    col_headers.clear()  # new table
                    article = ""
                else:
                    # Column header row: record each header's column range.
                    colspan = 1
                    colspan_str = cell.attrs.get("colspan", "1")
                    # Only trust purely numeric colspan values.
                    if re.fullmatch(r"\d+", colspan_str):
                        colspan = int(colspan_str)
                    if cell_text != "—":
                        col_headers.append(
                            RowspanHeader(cell_text, col_index, colspan)
                        )
                    col_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if col_index % 2 == 0:
                    # Even column: article for the form in the next cell.
                    if cell_text != "—":
                        article = cell_text
                else:
                    # Odd column: the form itself.
                    form = Form(form=cell_text, article=article)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for header in col_headers:
                        if (
                            col_index >= header.index
                            and col_index < header.index + header.span
                            and header.text != "Wortform"
                        ):
                            form.raw_tags.append(header.text)
                    translate_raw_tags(form)
                    # Placeholder cells produce no form.
                    if form.form not in ["", "—"]:
                        word_entry.forms.append(form)
                    # The article applies to one form only.
                    article = ""
                col_index += 1
def separate_de_article(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Move a leading German definite article from ``form`` to ``article``.

    A form that starts with one of der/die/das/den/dem/des (optionally in
    parentheses) followed by whitespace has that prefix removed and the bare
    article stored in ``form.article``.  Forms are modified in place; those
    whose remaining text is empty or "—" are left out of the result.

    References:
        https://en.wikipedia.org/wiki/German_articles
        https://de.wiktionary.org/wiki/Vorlage:Deutsch_Substantiv_Übersicht
        https://de.wiktionary.org/wiki/Vorlage:Deutsch_Vorname_Übersicht_m

    Args:
        wxr: Extraction context (not used by this function).
        forms: Forms to process.

    Returns:
        A new list with the surviving forms in their original order.
    """
    # The article may be wrapped in parentheses, e.g. "(der) Hans".
    article_prefix = re.compile(r"\(?(der|die|das|den|dem|des)\)?\s+")
    kept = []
    for entry in forms:
        found = article_prefix.match(entry.form)
        if found is not None:
            entry.article = found.group(1)
            entry.form = entry.form[found.end() :]
        if entry.form in ("", "—"):
            continue
        kept.append(entry)
    return kept