Coverage for src / wiktextract / extractor / de / inflection.py: 86%
211 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import NodeKind, TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .flexion import parse_flexion_page
9from .models import Form, WordEntry
10from .tags import translate_raw_tags
12# Kategorie:Wiktionary:Flexionstabelle (Deutsch)
def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection-table template to the parser for its table type.

    The decision is made purely from ``t_node.template_name``; the checks
    run in a fixed order (noun-like, adjective/adverb, verb, possessive
    pronoun) and the first match wins. Templates matching none of the
    checks are ignored.
    """
    name = t_node.template_name

    # Template-name endings that are all handled by the noun-table parser.
    noun_like_endings = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )
    uses_noun_parser = (
        "Substantiv Übersicht" in name
        or name.endswith(noun_like_endings)
        or re.search(r" Personalpronomen \d$", name)
    )
    if uses_noun_parser:
        process_noun_table(wxr, word_entry, t_node)
        return

    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return

    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return

    if name == "Deutsch Possessivpronomen":
        extract_pronoun_table(wxr, word_entry, t_node)
# A table header cell together with the position and extent it covers.
# Despite the name, the parsers in this module use it both for row headers
# (``span`` filled from the ``rowspan`` attribute) and for column headers
# (``span`` filled from the ``colspan`` attribute).
@dataclass
class RowspanHeader:
    # Cleaned text of the header cell.
    text: str
    # Column index at which the header starts (row headers pass 0).
    index: int
    # Number of rows or columns the header covers.
    span: int
def _parse_rowspan(table_cell) -> int:
    """Return the cell's ``rowspan`` attribute as an int, defaulting to 1.

    A missing attribute yields 1. A value that ``int()`` cannot parse also
    yields 1 instead of raising, so one malformed cell does not abort the
    extraction of the whole table.
    """
    try:
        return int(table_cell.attrs.get("rowspan", "1"))
    except ValueError:
        return 1


def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract verb forms from an expanded verb overview table.

    The template is expanded and its first table is walked row by row.
    Every form found in a data cell is appended to ``word_entry.forms``
    with raw tags taken from the column header at the same column index
    (if any) and from all row headers currently in effect; the tags are
    then passed through ``translate_raw_tags``. Links or cell values that
    start with "Flexion:" are handed to ``parse_flexion_page``.

    Does nothing when the expanded template contains no table.
    """
    # Template page: Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    col_headers = []
    has_person = False
    # Row headers still in effect; each one expires after ``span`` rows.
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        header_col_index = 0
        pronouns = []
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                # Cell linking to the separate full-conjugation page.
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    # Empty header cells are skipped without being counted.
                    continue
                elif header_col_index == 0:
                    # First non-empty header cell of the row: a row header.
                    row_headers.append(
                        RowspanHeader(
                            cell_text, 0, _parse_rowspan(table_cell)
                        )
                    )
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    pronouns.clear()
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if (
                    "background-color: #f4f4f4"
                    in table_cell.attrs.get("style", "").lower()
                ):
                    # Template:Englisch Verb Übersicht
                    # A data cell with this background is treated as a row
                    # header and does not advance the column index.
                    row_headers.append(
                        RowspanHeader(
                            cell_text, 0, _parse_rowspan(table_cell)
                        )
                    )
                    continue
                elif has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        # Comma-separated pronouns that apply to the forms
                        # in the remaining cells of this row.
                        pronouns = list(
                            filter(None, map(str.strip, cell_text.split(",")))
                        )
                else:
                    for cell_line in cell_text.splitlines():
                        for form_str in map(str.strip, cell_line.split(",")):
                            if form_str in ["", "—", wxr.wtp.title]:
                                continue
                            elif form_str.startswith("Flexion:"):
                                parse_flexion_page(wxr, word_entry, form_str)
                                continue
                            form = Form(form=form_str, pronouns=pronouns)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # Age the row headers: keep only those that still span later rows.
        new_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                new_row_headers.append(row_header)
        row_headers = new_row_headers
def process_noun_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract forms from an expanded noun-style overview table.

    The template is expanded and its first table is walked row by row.
    Rows without data cells are header rows and define the column headers
    (with their ``colspan``); in other rows a header cell supplies the row
    header. Each line of a data cell becomes a ``Form`` appended to
    ``word_entry.forms``, tagged with the table header, the row header and
    every column header whose span covers the cell's column.

    After the table, the expanded template is passed to ``clean_node``
    with ``word_entry`` (category links) and any level-4 section whose
    title starts with "Anmerkung" is passed to ``extract_note_section``.

    Does nothing when the expanded template contains no table.
    """
    # Template page: Vorlage:Deutsch Substantiv Übersicht
    from .page import extract_note_section

    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    column_headers = []
    table_header = ""
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ["", "Kasus", "Utrum", "m", "f", "m, f"]
                    and col_index == 0
                ):
                    # Leading corner/label cell: not counted as a column.
                    continue
                elif is_header_row:
                    # A malformed colspan value falls back to a single
                    # column instead of aborting the whole extraction.
                    try:
                        colspan = int(table_cell.attrs.get("colspan", "1"))
                    except ValueError:
                        colspan = 1
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                # Drop a trailing number from the header.
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                continue
            elif not row_has_header:
                # Vorlage:Deutsch adjektivisch Übersicht
                # A data cell in a row without header cells starts a new
                # sub-table: it becomes the table header and resets the
                # column headers.
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ["—", "–", "-", "", "?", wxr.wtp.title]:
                        continue
                    form = Form(form=form_text)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if len(row_header) > 0:
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if (
                            col_header.text not in ("", "—")
                            and col_index >= col_header.index
                            and col_index < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                col_index += 1

    clean_node(wxr, word_entry, expanded_template)  # category links
    # Template page: Vorlage:Deutsch Nachname Übersicht
    for level_node in expanded_template.find_child(NodeKind.LEVEL4):
        section_text = clean_node(wxr, None, level_node.largs)
        if section_text.startswith("Anmerkung"):
            extract_note_section(wxr, word_entry, level_node)
def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract forms from an expanded adjective/adverb overview table.

    Header cells are collected, in encounter order, into one flat list of
    column headers. Every line of a data cell becomes a ``Form`` appended
    to ``word_entry.forms``, tagged with the header at the cell's position
    within its row when such a header exists.

    Does nothing when the expanded template contains no table.
    """
    # Template page: Vorlage:Deutsch Adjektiv Übersicht
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if len(tables) == 0:
        return

    headers = []
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        cells = row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        )
        for position, cell in enumerate(cells):
            text = clean_node(wxr, None, cell)

            # The {{int:}} magic word is not implemented, so the template
            # "Textbaustein-Intl" expands to English words; hence the
            # English marker text here.
            if text.startswith("All other forms:"):
                for link in cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link)
                    parse_flexion_page(wxr, word_entry, link_text)
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                headers.append(text)
                continue

            for line in text.splitlines():
                if line in ("—", "", "?"):
                    continue
                form = Form(form=line)
                if position < len(headers):
                    form.raw_tags.append(headers[position])
                translate_raw_tags(form)
                word_entry.forms.append(form)
def extract_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from an expanded possessive-pronoun table.

    The template is expanded and its first table is walked row by row.
    Header cells are classified per row: in a row that has data cells a
    header cell is the row header; a header-only row with exactly one
    header cell starts a new sub-table (table header); any other header
    cell is a column header with its ``colspan``. Data cells alternate
    between an article (even column index) and a word form (odd column
    index); the two are joined with a space unless the article is empty
    or "—". Forms equal to the page title are not added, and empty or
    "—" form cells are skipped.

    Does nothing when the expanded template contains no table.
    """
    # Template page: Vorlage:Deutsch Possessivpronomen
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    col_headers = []
    table_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        # Constant for the whole row, so count the header cells once
        # instead of once per header cell.
        has_single_header = (
            len(list(row.find_child(NodeKind.TABLE_HEADER_CELL))) == 1
        )
        col_index = 0
        article = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif row_has_data:
                    row_header = cell_text
                elif has_single_header:
                    table_header = cell_text
                    col_headers.clear()  # new table
                    article = ""
                else:
                    colspan = 1
                    colspan_str = cell.attrs.get("colspan", "1")
                    # Only trust a purely numeric colspan value.
                    if re.fullmatch(r"\d+", colspan_str):
                        colspan = int(colspan_str)
                    if cell_text != "—":
                        col_headers.append(
                            RowspanHeader(cell_text, col_index, colspan)
                        )
                    col_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if col_index % 2 == 0:
                    # Even columns hold the article for the next cell.
                    article = cell_text
                elif cell_text in ("", "—"):
                    # Placeholder cell: there is no form to record, so do
                    # not emit an empty/dash form (or "<article> —").
                    article = ""
                else:
                    form_str = (
                        article + " " + cell_text
                        if article not in ["", "—"]
                        else cell_text
                    )
                    form = Form(form=form_str)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for header in col_headers:
                        if (
                            col_index >= header.index
                            and col_index < header.index + header.span
                            and header.text != "Wortform"
                        ):
                            form.raw_tags.append(header.text)
                    translate_raw_tags(form)
                    if form.form != wxr.wtp.title:
                        word_entry.forms.append(form)
                    article = ""
                col_index += 1