Coverage for src/wiktextract/extractor/de/inflection.py: 86%
207 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import NodeKind, TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .flexion import parse_flexion_page
9from .models import Form, WordEntry
10from .tags import translate_raw_tags
# Kategorie:Wiktionary:Flexionstabelle (Deutsch)
# (de.wiktionary category name; roughly "Wiktionary: inflection table (German)")
def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Hand an inflection-table template over to the matching table parser.

    The parser is selected purely from ``t_node.template_name``.  A template
    whose name matches none of the patterns below is ignored.
    """
    name = t_node.template_name

    # Name endings of templates that are parsed like a noun table.
    noun_like_endings = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )
    uses_noun_layout = (
        "Substantiv Übersicht" in name
        or name.endswith(noun_like_endings)
        or re.search(r" Personalpronomen \d$", name) is not None
    )
    if uses_noun_layout:
        process_noun_table(wxr, word_entry, t_node)
        return

    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return

    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return

    if name == "Deutsch Possessivpronomen":
        extract_pronoun_table(wxr, word_entry, t_node)
@dataclass
class RowspanHeader:
    """A table header cell together with the rows or columns it covers.

    Despite the name, the parsers in this module use it both for row
    headers (``span`` holds a ``rowspan``) and for column headers (``span``
    holds a ``colspan``).
    """

    # Cleaned text of the header cell; used as a raw tag on matching forms.
    text: str
    # Column index at which the header starts; the verb table parser
    # always stores 0 here.
    index: int
    # Number of rows (verb table) or columns (noun/pronoun tables) covered.
    span: int
def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect verb forms from the first table of an expanded template.

    The template is expanded and re-parsed, and only the first table node is
    read.  Every form found is appended to ``word_entry.forms`` with the
    active column header and row headers as raw tags, which are then passed
    through ``translate_raw_tags``.  Links or cell values starting with
    ``Flexion:`` are forwarded to ``parse_flexion_page``.

    Returns nothing; does nothing when the expansion contains no table.
    """
    # Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    col_headers = []
    has_person = False
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        header_col_index = 0
        pronouns = []
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                # Cell that links to the separate page with the full paradigm.
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif header_col_index == 0:
                    # First non-empty header of the row is a row header.
                    # A malformed rowspan attribute must not abort the whole
                    # extraction, so fall back to a single row.
                    try:
                        rowspan = int(table_cell.attrs.get("rowspan", "1"))
                    except ValueError:
                        rowspan = 1
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                elif cell_text in ("Person", "Wortform"):
                    # From here on the first data cell of a row is not a form.
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    pronouns.clear()
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        # Span of 1: dropped again at the end of this row.
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        pronouns = list(
                            filter(None, map(str.strip, cell_text.split(",")))
                        )
                else:
                    for cell_line in cell_text.splitlines():
                        for form_str in map(str.strip, cell_line.split(",")):
                            if form_str in ["", "—", wxr.wtp.title]:
                                continue
                            elif form_str.startswith("Flexion:"):
                                parse_flexion_page(wxr, word_entry, form_str)
                                continue
                            form = Form(form=form_str, pronouns=pronouns)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                # Every data cell advances the column, pronoun cells included.
                col_index += 1

        # Keep only the row headers that still span the following row.
        new_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                new_row_headers.append(row_header)
        row_headers = new_row_headers
def process_noun_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect forms from the first table of an expanded noun-style template.

    The template is expanded and re-parsed, and only the first table node is
    read.  Each form is appended to ``word_entry.forms`` tagged with the
    current table header (if any), the row header and every column header
    whose span covers the cell's column; ``translate_raw_tags`` is applied
    to each form.  Afterwards the whole expansion is cleaned once against
    ``word_entry`` (category links) and level-4 sections whose title starts
    with "Anmerkung" are passed to ``extract_note_section``.

    Returns nothing; does nothing when the expansion contains no table.
    """
    # Vorlage:Deutsch Substantiv Übersicht
    # Imported here rather than at module level (presumably to avoid a
    # circular import with ``.page`` -- TODO confirm).
    from .page import extract_note_section

    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    column_headers = []
    table_header = ""
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ["", "Kasus", "Utrum", "m", "f", "m, f"]
                    and col_index == 0
                ):
                    # Leading corner/label header: carries no tag.
                    continue
                elif is_header_row:
                    # A malformed colspan attribute must not abort the whole
                    # extraction, so fall back to a single column.
                    try:
                        colspan = int(table_cell.attrs.get("colspan", "1"))
                    except ValueError:
                        colspan = 1
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                # Drop a trailing number from the header.
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                # NOTE(review): an empty data cell does not advance
                # col_index -- confirm this matches the header layout.
                continue
            elif not row_has_header:
                # Vorlage:Deutsch adjektivisch Übersicht
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ["—", "–", "-", "", "?", wxr.wtp.title]:
                        continue
                    form = Form(form=form_text)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if len(row_header) > 0:
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if (
                            col_header.text not in ("", "—")
                            and col_index >= col_header.index
                            and col_index < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                # One column per data cell, however many forms it held.
                col_index += 1

    clean_node(wxr, word_entry, expanded_template)  # category links
    # Vorlage:Deutsch Nachname Übersicht
    for level_node in expanded_template.find_child(NodeKind.LEVEL4):
        section_text = clean_node(wxr, None, level_node.largs)
        if section_text.startswith("Anmerkung"):
            extract_note_section(wxr, word_entry, level_node)
def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect forms from the first table of an expanded adjective template.

    Header cell texts are gathered in order and used as the raw tag of the
    data cell at the same position within its row.  Links inside the
    "All other forms:" cell are handed to ``parse_flexion_page``.
    """
    # Vorlage:Deutsch Adjektiv Übersicht
    parsed = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    first_table = next(iter(parsed.find_child(NodeKind.TABLE)), None)
    if first_table is None:
        return

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    headers = []
    for row in first_table.find_child(NodeKind.TABLE_ROW):
        for position, cell in enumerate(row.find_child(cell_kinds)):
            text = clean_node(wxr, None, cell)

            # The {{int:}} magic word is not implemented, so the
            # "Textbaustein-Intl" template expands to English words.
            if text.startswith("All other forms:"):
                for link in cell.find_child(NodeKind.LINK):
                    target = clean_node(wxr, None, link)
                    parse_flexion_page(wxr, word_entry, target)
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                headers.append(text)
                continue

            for line in text.splitlines():
                if line in ("—", "", "?"):
                    continue
                entry = Form(form=line)
                if position < len(headers):
                    entry.raw_tags.append(headers[position])
                translate_raw_tags(entry)
                word_entry.forms.append(entry)
def extract_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from the first table of an expanded pronoun template.

    Data cells are read in pairs: a cell at an even column index is kept as
    the article and the following cell is the word form.  Each form is
    tagged with the current table header, the row header and the covering
    column headers (except "Wortform"), then passed through
    ``translate_raw_tags``.  Forms equal to the page title are not added.

    Returns nothing; does nothing when the expansion contains no table.
    """
    # Vorlage:Deutsch Possessivpronomen
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    col_headers = []
    table_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        # Computed once per row instead of once per header cell.
        has_single_header = (
            len(list(row.find_child(NodeKind.TABLE_HEADER_CELL))) == 1
        )
        col_index = 0
        article = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif row_has_data:
                    row_header = cell_text
                elif has_single_header:
                    table_header = cell_text
                    col_headers.clear()  # new table
                    article = ""
                else:
                    colspan = 1
                    colspan_str = cell.attrs.get("colspan", "1")
                    if re.fullmatch(r"\d+", colspan_str):
                        colspan = int(colspan_str)
                    if cell_text != "—":
                        col_headers.append(
                            RowspanHeader(cell_text, col_index, colspan)
                        )
                    col_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if col_index % 2 == 0:
                    article = cell_text
                else:
                    # An empty or placeholder cell holds no form; skip it
                    # instead of emitting "—" (or "<article> —") as a form.
                    if cell_text not in ("", "—"):
                        form_str = (
                            article + " " + cell_text
                            if article not in ["", "—"]
                            else cell_text
                        )
                        form = Form(form=form_str)
                        if table_header != "":
                            form.raw_tags.append(table_header)
                        if row_header != "":
                            form.raw_tags.append(row_header)
                        for header in col_headers:
                            if (
                                col_index >= header.index
                                and col_index < header.index + header.span
                                and header.text != "Wortform"
                            ):
                                form.raw_tags.append(header.text)
                        translate_raw_tags(form)
                        if form.form != wxr.wtp.title:
                            word_entry.forms.append(form)
                    # The article only applies to the form right after it.
                    article = ""
                col_index += 1