Coverage for src/wiktextract/extractor/de/flexion.py: 69%
188 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
1from dataclasses import dataclass
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Form, WordEntry
15from .tags import TAGS, translate_raw_tags
def parse_flexion_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract inflected forms from the "Flexion" namespace page of a word.

    The page body is looked up under ``page_title`` in the "Flexion"
    namespace; nothing happens when no body is returned. Every level-2
    heading resets the list of raw tags shared by the templates that follow
    it, and every supported template is handed to its dedicated extractor.

    Args:
        wxr: Extraction context giving access to the wikitext processor.
        word_entry: Entry passed on to the template extractors.
        page_title: Title of the flexion page; also passed on to the
            extractors as the source of the forms.
    """
    # https://de.wiktionary.org/wiki/Hilfe:Flexionsseiten
    auxiliary_tags = ("Hilfsverb haben", "Hilfsverb sein")

    namespace_id = wxr.wtp.NAMESPACE_DATA["Flexion"]["id"]
    page_body = wxr.wtp.get_page_body(page_title, namespace_id)
    if page_body is None:
        return

    section_tags = []
    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LEVEL2
    for node in wxr.wtp.parse(page_body).find_child_recursively(wanted_kinds):
        if node.kind == NodeKind.LEVEL2:
            # A new section starts: tags of the previous one no longer apply.
            section_tags.clear()
            heading = clean_node(wxr, None, node.largs)
            for token in heading.split(" "):
                token = token.strip(", ")
                if token not in TAGS or page_title.endswith(f":{token}"):
                    # Unknown tag, or the word is the page's own title.
                    continue
                section_tags.append(token)
            section_tags.extend(
                tag for tag in auxiliary_tags if tag in heading
            )
        elif node.kind == NodeKind.TEMPLATE:
            name = node.template_name
            # The exact name must be tested before the generic prefix.
            if name == "Deklinationsseite Numerale":
                extract_deklinationsseite_numerale_template(
                    wxr, word_entry, node, page_title
                )
            elif name.startswith("Deklinationsseite"):
                process_deklinationsseite_template(
                    wxr, word_entry, node, page_title
                )
            elif name.startswith("Deutsch Verb"):
                process_deutsch_verb_template(
                    wxr, word_entry, node, page_title, section_tags
                )
@dataclass
class SpanHeader:
    """A table header together with the range of columns it covers.

    Attributes:
        text: Header text.
        index: Index of the first column covered by the header.
        span: Number of consecutive columns covered, starting at ``index``.
    """

    text: str
    index: int
    span: int
def process_deklinationsseite_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite ..." template.

    The template is expanded and its top-level ``<h4>`` headings and tables
    are walked in order. Each data cell that is not an article column yields
    one ``Form`` per text line, tagged with the latest ``<h4>`` text, the
    row header and the column headers spanning the cell.

    Args:
        wxr: Extraction context giving access to the wikitext processor.
        word_entry: Entry whose ``forms`` list receives the new forms.
        template_node: The template call found on the flexion page.
        page_tite: Flexion page title, stored as ``source`` of each form.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Adjektiv
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    heading = ""
    for child in expanded.find_child(NodeKind.HTML | NodeKind.TABLE):
        if isinstance(child, HTMLNode) and child.tag == "h4":
            heading = clean_node(wxr, None, child)
            continue
        if child.kind != NodeKind.TABLE:
            continue

        span_headers = []
        article_columns = False
        for table_row in child.find_child(NodeKind.TABLE_ROW):
            column = 0
            row_label = ""
            current_article = ""
            for cell in table_row.find_child(cell_kinds):
                text = clean_node(wxr, None, cell)

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if text == "":
                        pass
                    elif text in ("Artikel", "Wortform"):
                        # From here on, even columns hold the article.
                        article_columns = True
                    elif "colspan" in cell.attrs:
                        width = int(cell.attrs.get("colspan"))
                        if width == 9:  # new table
                            article_columns = False
                            span_headers.clear()
                        span_headers.append(SpanHeader(text, column, width))
                        column += width
                    else:
                        row_label = text
                    continue

                if cell.kind != NodeKind.TABLE_CELL:
                    continue

                if article_columns and column % 2 == 0:
                    if text != "—":
                        current_article = text
                else:
                    tags = []
                    if heading != "":
                        tags.append(heading)
                    if row_label != "":
                        tags.append(row_label)
                    tags.extend(
                        header.text
                        for header in span_headers
                        if header.text not in ("", "—")
                        and header.index <= column < header.index + header.span
                    )
                    for line in text.splitlines():
                        form = Form(
                            form=line,
                            source=page_tite,
                            raw_tags=tags,
                            article=current_article,
                        )
                        if form.form not in ("", "—"):
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                column += int(cell.attrs.get("colspan", "1"))
def process_deutsch_verb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Expand a "Deutsch Verb ..." template and process its sections.

    Args:
        wxr: Extraction context giving access to the wikitext processor.
        word_entry: Entry passed on to the section processor.
        template_node: The template call found on the flexion page.
        page_tite: Flexion page title, passed on as the source of the forms.
        shared_raw_tags: Raw tags passed on to every top-level section.
    """
    # Vorlage:Deutsch Verb regelmäßig
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    for section in expanded_root.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, section, page_tite, shared_raw_tags
        )
def process_deutsch_verb_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Process the tables of one section, then recurse into its subsections.

    The section title is added to a copy of ``shared_raw_tags``; that copy
    is what the tables and the nested sections receive, so the caller's
    list is left as it was.

    Args:
        wxr: Extraction context giving access to the wikitext processor.
        word_entry: Entry passed on to the table processor.
        level_node: Section node of the expanded template.
        page_tite: Flexion page title, passed on as the source of the forms.
        shared_raw_tags: Raw tags inherited from the enclosing sections.
    """
    heading = clean_node(wxr, None, level_node.largs)
    section_raw_tags = [*shared_raw_tags, heading]

    for table in level_node.find_child(NodeKind.TABLE):
        process_deutsch_verb_table(
            wxr, word_entry, table, page_tite, section_raw_tags
        )

    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, subsection, page_tite, section_raw_tags
        )
def process_deutsch_verb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Extract verb forms from one table of an expanded verb template.

    Rows are scanned cell by cell. Header cells, and data cells that look
    like headers (they contain bold text, or a ``<small>`` element while
    ``filter_empty_str_child()`` yields a single child, or have the
    ``#f4f4f4`` background), are recorded as column or row headers. Every
    other data cell that is not a placeholder yields one ``Form`` per text
    line, tagged with ``shared_raw_tags``, an optional ``prefix:`` found in
    the line, the row header and the column headers spanning the cell.

    Args:
        wxr: Extraction context giving access to the wikitext processor.
        word_entry: Entry whose ``forms`` list receives the new forms.
        table: Table node to read.
        page_tite: Flexion page title, stored as ``source`` of each form.
        shared_raw_tags: Raw tags applied to every form of the table. The
            list is never modified by this function.
    """
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    col_headers = []
    for row in table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        col_index = 0
        col_header_index = 0
        # List the cells once; the list serves the bold check, the
        # single-cell check and the main loop below.
        cells = list(row.find_child(cell_kinds))
        # True when every non-empty data cell of the row contains bold text
        # (also true when there is no such cell at all).
        is_bold_col_header = all(
            c.contain_node(NodeKind.BOLD)
            for c in cells
            if c.kind == NodeKind.TABLE_CELL
            and clean_node(wxr, None, c) != ""
        )
        if len(cells) == 1:
            col_headers.clear()  # new table
        for cell in cells:
            cell_text = clean_node(wxr, None, cell)
            if cell_text in (
                "Flexion der Verbaladjektive",
                "(nichterweiterte) Infinitive",
            ):
                # The rest of such a row is not read.
                break
            elif cell.kind == NodeKind.TABLE_HEADER_CELL and cell_text not in (
                "",
                "Person",
            ):
                colspan = int(cell.attrs.get("colspan", "1"))
                col_headers.append(
                    SpanHeader(cell_text, col_header_index, colspan)
                )
                col_header_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if cell_text in (
                    "",
                    "—",
                    "Text",
                    "Person",
                ) or cell_text.startswith("Flexion:"):
                    # Placeholder cell: occupies a column, holds no form.
                    col_index += 1
                elif (
                    cell.contain_node(NodeKind.BOLD)
                    or (
                        len(list(cell.find_html("small"))) > 0
                        and len(list(cell.filter_empty_str_child())) == 1
                    )
                    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
                    or cell.attrs.get("bgcolor", "").lower() == "#f4f4f4"
                ):  # header in cell
                    colspan = int(cell.attrs.get("colspan", "1"))
                    if is_bold_col_header:
                        for bold_node in cell.find_child(NodeKind.BOLD):
                            col_headers.append(
                                SpanHeader(
                                    clean_node(wxr, None, bold_node),
                                    col_header_index,
                                    colspan,
                                )
                            )
                    else:
                        row_header = cell_text
                    col_header_index += colspan
                else:
                    for form_text in cell_text.splitlines():
                        form_text = form_text.strip(", ")
                        form_raw_tag = ""
                        if ":" in form_text:
                            form_raw_tag, form_text = form_text.split(":", 1)
                        form = Form(
                            form=form_text.strip(),
                            source=page_tite,
                            # Pass a copy: the appends below must never reach
                            # the caller's list (and through it every later
                            # form), whether or not Form copies its input.
                            raw_tags=list(shared_raw_tags),
                        )
                        if form_raw_tag != "":
                            form.raw_tags.append(form_raw_tag)
                        if row_header != "":
                            form.raw_tags.append(row_header)
                        for col_header in col_headers:
                            if (
                                col_header.index
                                <= col_index
                                < col_header.index + col_header.span
                            ):
                                if col_header.text.endswith("I"):
                                    # Kept as a single tag, not split
                                    # into words.
                                    form.raw_tags.append(col_header.text)
                                else:
                                    form.raw_tags.extend(
                                        col_header.text.split()
                                    )
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    col_index += 1
def extract_deklinationsseite_numerale_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite Numerale" template.

    Rows without data cells supply column headers; in rows with data cells
    a header cell supplies the row header. The children of a data cell are
    split into words at ``<br>`` elements, and italic children are collected
    as raw tags for the words of that cell.

    Args:
        wxr: Extraction context giving access to the wikitext processor.
        word_entry: Entry whose ``forms`` list receives the new forms.
        t_node: The template call found on the flexion page.
        page_tite: Flexion page title, stored as ``source`` of each form.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Numerale
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in expanded.find_child(NodeKind.TABLE):
        span_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_label = ""
            data_row = row.contain_node(NodeKind.TABLE_CELL)
            column = 0
            for cell in row.find_child(cell_kinds):
                text = clean_node(wxr, None, cell)
                if text == "":
                    continue

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if data_row:
                        row_label = text
                        continue
                    width = int(cell.attrs.get("colspan", "1"))
                    if column == 0:
                        span_headers.clear()  # new table
                    span_headers.append(SpanHeader(text, column, width))
                    column += width
                    continue

                # Data cell: gather nodes up to each <br> into one word.
                pending_nodes = []
                italic_tags = []
                for child in cell.children:
                    is_break = isinstance(child, HTMLNode) and child.tag == "br"
                    is_italic = (
                        isinstance(child, WikiNode)
                        and child.kind == NodeKind.ITALIC
                    )
                    if is_break:
                        word = clean_node(wxr, None, pending_nodes)
                        if word != "":
                            deklinationsseite_numerale_add_form(
                                word_entry,
                                word,
                                page_tite,
                                italic_tags,
                                column,
                                row_label,
                                span_headers,
                            )
                        pending_nodes.clear()
                    elif is_italic:
                        italic_text = clean_node(wxr, None, child).strip(": ")
                        if italic_text != "":
                            italic_tags.append(italic_text)
                    else:
                        pending_nodes.append(child)

                # Whatever follows the last <br> is the final word.
                word = clean_node(wxr, None, pending_nodes)
                if word != "":
                    deklinationsseite_numerale_add_form(
                        word_entry,
                        word,
                        page_tite,
                        italic_tags,
                        column,
                        row_label,
                        span_headers,
                    )
                column += 1
def deklinationsseite_numerale_add_form(
    word_entry: WordEntry,
    word: str,
    source: str,
    raw_tags: list[str],
    index: int,
    row_header: str,
    col_headers: list[SpanHeader],
) -> None:
    """Build one form with its raw tags and append it to ``word_entry``.

    Args:
        word_entry: Entry whose ``forms`` list receives the new form.
        word: Text of the form.
        source: Stored as ``source`` of the form.
        raw_tags: Raw tags collected for the cell. The list is never
            modified by this function.
        index: Column index of the cell the word comes from.
        row_header: Row header text; added as a raw tag unless empty.
        col_headers: Column headers; the text of each header whose span
            contains ``index`` is added as a raw tag.
    """
    form = Form(
        form=word,
        source=source,
        # Pass a copy: the caller reuses ``raw_tags`` for every word of a
        # cell, so the appends below must not reach it, whether or not
        # Form copies its input.
        raw_tags=list(raw_tags),
    )
    if row_header != "":
        form.raw_tags.append(row_header)
    form.raw_tags.extend(
        col_header.text
        for col_header in col_headers
        if col_header.index <= index < col_header.index + col_header.span
    )
    translate_raw_tags(form)
    word_entry.forms.append(form)