Coverage for src/wiktextract/extractor/de/inflection.py: 86%
214 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import NodeKind, TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .flexion import parse_flexion_page
9from .models import Form, WordEntry
10from .tags import translate_raw_tags
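# Kategorie:Wiktionary:Flexionstabelle (Deutsch)
# (de.wiktionary category name; presumably lists the German inflection-table
# templates dispatched below)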
def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection-table template to the parser for its table kind.

    The choice is made from ``t_node.template_name`` only.  The rules are
    tried in order (noun-like, adjective/adverb, verb, possessive pronoun);
    a template name matching none of them is silently ignored.
    """
    name = t_node.template_name
    # Template-name endings whose tables are parsed like noun tables.
    noun_like_suffixes = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )
    is_noun_like = (
        "Substantiv Übersicht" in name
        or name.endswith(noun_like_suffixes)
        or re.search(r" Personalpronomen \d$", name)
    )
    if is_noun_like:
        process_noun_table(wxr, word_entry, t_node)
        return
    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return
    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return
    if name == "Deutsch Possessivpronomen":
        extract_pronoun_table(wxr, word_entry, t_node)
@dataclass
class RowspanHeader:
    """A table header cell plus the range of cells it applies to.

    Despite the name, the table parsers in this module use it for both
    directions: the verb table stores row headers (``span`` is the rowspan
    and is counted down after every row), while the noun and pronoun tables
    store column headers (``index`` is the first column, ``span`` the
    colspan).
    """

    # Cleaned text of the header cell; later appended to ``Form.raw_tags``.
    text: str
    # Index of the first cell covered by this header.
    index: int
    # Number of rows or columns covered, starting at ``index``.
    span: int
def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect verb forms from the first table of an expanded template.

    The template is converted back to wikitext, re-parsed with all templates
    expanded, and the first table node is walked row by row.  Every form
    found is appended to ``word_entry.forms`` with the applicable column and
    row header texts as raw tags; nothing is returned.

    Header cells:
      * the first non-empty header cell of a row is a row header that stays
        active for ``rowspan`` rows;
      * "Person" / "Wortform" switch on "person mode";
      * any other header cell is a column header and ends person mode.

    Data cells:
      * in person mode the first data cell of a row is the person text,
        except "Singular" / "Plural", which become one-row row headers;
      * every other data cell yields one form per non-empty line, prefixed
        with each comma-separated person; forms equal to the page title are
        skipped.

    Cells starting with "All other forms:" and lines starting with
    "Flexion:" are handed to ``parse_flexion_page`` instead.
    """
    # Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    col_headers = []
    has_person = False
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0  # counts data cells only
        header_col_index = 0  # counts non-empty header cells only
        person = ""
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif header_col_index == 0:
                    # A malformed rowspan attribute must not abort the whole
                    # table; treat it as a header covering a single row.
                    try:
                        rowspan = int(table_cell.attrs.get("rowspan", "1"))
                    except ValueError:
                        rowspan = 1
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    person = ""
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        person = cell_text
                else:
                    # The person text is the same for every line of the
                    # cell, so split it once.  An empty person gives [""],
                    # i.e. one pass that emits the line without a prefix.
                    persons = [p.strip() for p in person.split(",")]
                    for cell_line in cell_text.splitlines():
                        cell_line = cell_line.strip()
                        if cell_line in ["", "—"]:
                            continue
                        elif cell_line.startswith("Flexion:"):
                            parse_flexion_page(wxr, word_entry, cell_line)
                            continue
                        for p in persons:
                            form_text = cell_line
                            if p != "":
                                form_text = p + " " + cell_line
                            if form_text == wxr.wtp.title:
                                continue
                            form = Form(form=form_text)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # Use up one row of every active row header and drop those that do
        # not reach into the next row.
        new_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                new_row_headers.append(row_header)
        row_headers = new_row_headers
def process_noun_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect forms from the first table of an expanded noun-like template.

    The template is converted back to wikitext, re-parsed with all templates
    expanded, and the first table node is walked row by row.  Forms are
    appended to ``word_entry.forms``; nothing is returned.

    * A row without data cells is a header row: its header cells become
      column headers (trailing digits removed) positioned by ``colspan``.
    * In other rows a header cell is the row header.
    * A data cell in a row without any header cell is a table caption: it
      resets the column headers, and its "Flexion:" links are passed to
      ``parse_flexion_page``.
    * Any other non-empty data cell yields one form per line.  Surrounding
      parentheses are removed, and placeholder lines and the page title are
      skipped.  Each form is tagged with the caption, the row header and
      the column headers covering its column.

    Afterwards ``clean_node`` is run over the whole expansion with
    ``word_entry`` (category links), and level-4 sections whose title starts
    with "Anmerkung" are passed to ``extract_note_section``.
    """
    # Vorlage:Deutsch Substantiv Übersicht
    # NOTE(review): imported locally, presumably to avoid a circular import
    # between this module and .page; confirm before moving to module level.
    from .page import extract_note_section

    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    column_headers = []
    table_header = ""
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ["", "Kasus", "Utrum", "m", "f", "m, f"]
                    and col_index == 0
                ):
                    # Corner cell above the row headers: not a column.
                    continue
                elif is_header_row:
                    # A malformed colspan attribute must not abort the whole
                    # table; treat it as a header covering a single column.
                    try:
                        colspan = int(table_cell.attrs.get("colspan", "1"))
                    except ValueError:
                        colspan = 1
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                continue
            elif not row_has_header:
                # Vorlage:Deutsch adjektivisch Übersicht
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ["—", "–", "-", "", "?", wxr.wtp.title]:
                        continue
                    form = Form(form=form_text)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if row_header:
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if (
                            col_header.text not in ("", "—")
                            and col_index >= col_header.index
                            and col_index < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                col_index += 1

    clean_node(wxr, word_entry, expanded_template)  # category links
    # Vorlage:Deutsch Nachname Übersicht
    for level_node in expanded_template.find_child(NodeKind.LEVEL4):
        section_text = clean_node(wxr, None, level_node.largs)
        if section_text.startswith("Anmerkung"):
            extract_note_section(wxr, word_entry, level_node)
def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect forms from the first table of an expanded adjective template.

    Header cell texts are accumulated in one list across all rows.  Each
    data cell yields one form per line (placeholder lines are skipped),
    tagged with the header text stored at the cell's position in its row,
    when there is one.  Forms are appended to ``word_entry.forms``; nothing
    is returned.
    """
    # Vorlage:Deutsch Adjektiv Übersicht
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if len(tables) == 0:
        return
    headers = []
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        for position, cell in enumerate(row.find_child(cell_kinds)):
            text = clean_node(wxr, None, cell)
            # because {{int:}} magic word is not implemented
            # template "Textbaustein-Intl" expands to English words
            if text.startswith("All other forms:"):
                for link in cell.find_child(NodeKind.LINK):
                    parse_flexion_page(
                        wxr, word_entry, clean_node(wxr, None, link)
                    )
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                headers.append(text)
                continue
            for line in text.splitlines():
                if line in ("—", "", "?"):
                    continue
                form = Form(form=line)
                if position < len(headers):
                    form.raw_tags.append(headers[position])
                translate_raw_tags(form)
                word_entry.forms.append(form)
def extract_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from the first table of a possessive-pronoun template.

    Data cells are read in pairs: the cell at an even column index is kept
    as an article, and the cell at the following odd index is the word form.
    The article is prepended unless it is empty or "—".  Each form is tagged
    with the current table caption, the row header and the column headers
    covering its column (except "Wortform").  Forms equal to the page title
    are dropped.  Forms are appended to ``word_entry.forms``; nothing is
    returned.

    Header cells are classified per row: in a row that has data cells a
    header cell is the row header; a header-only row with exactly one header
    cell is a caption that starts a new sub-table; otherwise the header
    cells are column headers positioned by their ``colspan``.
    """
    # Vorlage:Deutsch Possessivpronomen
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if not tables:
        return
    col_headers = []
    table_header = ""
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        row_header = ""
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        col_index = 0
        article = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                if row_has_data:
                    row_header = cell_text
                    continue
                header_cells = list(row.find_child(NodeKind.TABLE_HEADER_CELL))
                if len(header_cells) == 1:
                    table_header = cell_text
                    col_headers.clear()  # new table
                    article = ""
                    continue
                # Column header: fall back to a span of 1 when the colspan
                # attribute is not a plain number.
                raw_span = cell.attrs.get("colspan", "1")
                colspan = int(raw_span) if re.fullmatch(r"\d+", raw_span) else 1
                if cell_text != "—":
                    col_headers.append(
                        RowspanHeader(cell_text, col_index, colspan)
                    )
                col_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if col_index % 2 == 0:
                    article = cell_text
                else:
                    if article in ["", "—"]:
                        form_str = cell_text
                    else:
                        form_str = article + " " + cell_text
                    form = Form(form=form_str)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for header in col_headers:
                        covers_column = (
                            header.index <= col_index
                            and col_index < header.index + header.span
                        )
                        if covers_column and header.text != "Wortform":
                            form.raw_tags.append(header.text)
                    translate_raw_tags(form)
                    if form.form != wxr.wtp.title:
                        word_entry.forms.append(form)
                    article = ""
                col_index += 1