# Coverage for src/wiktextract/extractor/de/flexion.py: 70% (190 statements)
# Report generated by coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1from dataclasses import dataclass
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Form, WordEntry
15from .tags import GRAMMATICAL_TAGS, translate_raw_tags
18def parse_flexion_page(
19 wxr: WiktextractContext, word_entry: WordEntry, page_title: str
20) -> None:
21 # https://de.wiktionary.org/wiki/Hilfe:Flexionsseiten
22 LEVEL2_TAGS = ["Hilfsverb haben", "Hilfsverb sein"]
24 flexion_page = wxr.wtp.get_page_body(
25 page_title, wxr.wtp.NAMESPACE_DATA["Flexion"]["id"]
26 )
27 if flexion_page is None: 27 ↛ 28line 27 didn't jump to line 28 because the condition on line 27 was never true
28 return
29 flexion_root = wxr.wtp.parse(flexion_page)
30 shared_raw_tags = []
31 for node in flexion_root.find_child_recursively(
32 NodeKind.TEMPLATE | NodeKind.LEVEL2
33 ):
34 match node.kind:
35 case NodeKind.LEVEL2:
36 shared_raw_tags.clear()
37 section_str = clean_node(wxr, None, node.largs)
38 for word in section_str.split(" "):
39 word = word.strip(", ")
40 if word in GRAMMATICAL_TAGS and not page_title.endswith( 40 ↛ 43line 40 didn't jump to line 43 because the condition on line 40 was never true
41 f":{word}"
42 ):
43 shared_raw_tags.append(word)
44 for raw_tag in LEVEL2_TAGS:
45 if raw_tag in section_str: 45 ↛ 46line 45 didn't jump to line 46 because the condition on line 45 was never true
46 shared_raw_tags.append(raw_tag)
47 case NodeKind.TEMPLATE: 47 ↛ 31line 47 didn't jump to line 31 because the pattern on line 47 always matched
48 if node.template_name == "Deklinationsseite Numerale": 48 ↛ 49line 48 didn't jump to line 49 because the condition on line 48 was never true
49 extract_deklinationsseite_numerale_template(
50 wxr, word_entry, node, page_title
51 )
52 elif node.template_name.startswith("Deklinationsseite"):
53 process_deklinationsseite_template(
54 wxr, word_entry, node, page_title
55 )
56 elif node.template_name.startswith("Deutsch Verb"):
57 process_deutsch_verb_template(
58 wxr, word_entry, node, page_title, shared_raw_tags
59 )
@dataclass
class SpanHeader:
    """A table header together with the column range it covers.

    A header applies to a column ``c`` when
    ``index <= c < index + span``.
    """

    # Cleaned text of the header cell.
    text: str
    # Index of the first column the header covers.
    index: int
    # Number of columns the header covers.
    span: int
def process_deklinationsseite_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite..." template.

    The template is expanded and re-parsed; its top-level ``<h4>`` elements
    and tables are then read in document order.  The text of the latest
    ``<h4>`` is attached as a raw tag to every form of the tables that
    follow it.  Each extracted form is appended to ``word_entry.forms`` with
    `page_tite` as its source.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Adjektiv
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    h4_text = ""
    for node in expanded_template.find_child(NodeKind.HTML | NodeKind.TABLE):
        if isinstance(node, HTMLNode) and node.tag == "h4":
            h4_text = clean_node(wxr, None, node)
        elif node.kind == NodeKind.TABLE:
            col_headers = []
            has_article = False
            for row_node in node.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                article = ""
                for cell_node in row_node.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    cell_text = clean_node(wxr, None, cell_node)
                    if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                        if cell_text == "":
                            continue
                        elif cell_text in ("Artikel", "Wortform"):
                            # Data cells of this table alternate between
                            # an article and a word form.
                            has_article = True
                            continue
                        elif "colspan" in cell_node.attrs:
                            col_span = int(cell_node.attrs.get("colspan"))
                            if col_span == 9:  # new table
                                has_article = False
                                col_headers.clear()
                            col_headers.append(
                                SpanHeader(cell_text, col_index, col_span)
                            )
                            col_index += col_span
                        else:
                            row_header = cell_text
                    elif cell_node.kind == NodeKind.TABLE_CELL:
                        if has_article and col_index % 2 == 0:
                            # Even columns hold the article that belongs to
                            # the word form in the next cell.
                            article = cell_text
                        else:
                            form_text = ""
                            if article not in ("", "—"):
                                form_text = article + " "
                            raw_tags = []
                            if h4_text != "":
                                raw_tags.append(h4_text)
                            if row_header != "":
                                raw_tags.append(row_header)
                            for col_header in col_headers:
                                if (
                                    col_header.text not in ("", "—")
                                    and col_index >= col_header.index
                                    and col_index
                                    < col_header.index + col_header.span
                                ):
                                    raw_tags.append(col_header.text)
                            for line in cell_text.splitlines():
                                form = Form(
                                    form=form_text + line,
                                    source=page_tite,
                                    # Every form gets its own list so that
                                    # tag translation of one form cannot
                                    # affect the next line of this cell.
                                    raw_tags=list(raw_tags),
                                )
                                if form.form not in ("", "—"):
                                    translate_raw_tags(form)
                                    word_entry.forms.append(form)
                        col_index += int(cell_node.attrs.get("colspan", "1"))
def process_deutsch_verb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Extract forms from an expanded "Deutsch Verb..." template.

    The template is expanded and re-parsed, and every top-level section of
    the result is handed to `process_deutsch_verb_section` together with
    `shared_raw_tags`.
    """
    # Vorlage:Deutsch Verb regelmäßig
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_template = wxr.wtp.parse(template_wikitext, expand_all=True)
    for level_node in expanded_template.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, level_node, page_tite, shared_raw_tags
        )
def process_deutsch_verb_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Process the tables of one section and recurse into its subsections.

    The section title is appended to a copy of `shared_raw_tags`; the
    caller's list is left unmodified.  The extended list is passed to every
    table directly under the section and to every nested section.
    """
    section_title = clean_node(wxr, None, level_node.largs)
    # Build a new list so sibling sections do not see this section's title.
    new_raw_tags = [*shared_raw_tags, section_title]
    for table_node in level_node.find_child(NodeKind.TABLE):
        process_deutsch_verb_table(
            wxr, word_entry, table_node, page_tite, new_raw_tags
        )
    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, next_level, page_tite, new_raw_tags
        )
def process_deutsch_verb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Extract verb forms from one table of a "Deutsch Verb" template.

    Column headers are accumulated across rows in `col_headers` and reset
    whenever a row consists of a single cell.  Two separate counters are
    kept per row: `col_index` advances by one for each data cell (including
    skipped placeholder cells), while `col_header_index` advances by the
    ``colspan`` of each header.  A form receives the text of every column
    header whose range contains the form's `col_index`.

    Each form is appended to ``word_entry.forms`` with `page_tite` as its
    source and a private copy of `shared_raw_tags` as its initial raw tags.
    """
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    col_headers = []
    for row in table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        col_index = 0
        col_header_index = 0
        # True when every non-empty data cell of the row contains bold
        # text (also true when the row has no non-empty data cell).
        is_bold_col_header = all(
            c.contain_node(NodeKind.BOLD)
            for c in row.find_child(NodeKind.TABLE_CELL)
            if clean_node(wxr, None, c) != ""
        )
        cells = list(row.find_child(cell_kinds))
        if len(cells) == 1:
            col_headers.clear()  # new table
        for cell in cells:
            cell_text = clean_node(wxr, None, cell)
            if cell_text in (
                "Flexion der Verbaladjektive",
                "(nichterweiterte) Infinitive",
            ):
                # The remaining cells of such a row are not processed.
                break
            elif cell.kind == NodeKind.TABLE_HEADER_CELL and cell_text not in (
                "",
                "Person",
            ):
                colspan = int(cell.attrs.get("colspan", "1"))
                col_headers.append(
                    SpanHeader(cell_text, col_header_index, colspan)
                )
                col_header_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if cell_text in (
                    "",
                    "—",
                    "Text",
                    "Person",
                ) or cell_text.startswith("Flexion:"):
                    # Placeholder cell: occupies a column, yields no form.
                    col_index += 1
                elif (
                    cell.contain_node(NodeKind.BOLD)
                    or (
                        len(list(cell.find_html("small"))) > 0
                        and len(list(cell.filter_empty_str_child())) == 1
                    )
                    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
                    or cell.attrs.get("bgcolor", "").lower() == "#f4f4f4"
                ):  # header in cell
                    colspan = int(cell.attrs.get("colspan", "1"))
                    if is_bold_col_header:
                        for bold_node in cell.find_child(NodeKind.BOLD):
                            col_headers.append(
                                SpanHeader(
                                    clean_node(wxr, None, bold_node),
                                    col_header_index,
                                    colspan,
                                )
                            )
                    else:
                        row_header = cell_text
                    col_header_index += colspan
                else:
                    for form_text in cell_text.splitlines():
                        form_text = form_text.strip(", ")
                        form_raw_tag = ""
                        if ":" in form_text:
                            # "<tag>:<form>" - the part before the first
                            # colon becomes an extra raw tag.
                            form_raw_tag, form_text = form_text.split(":", 1)
                        form = Form(
                            form=form_text.strip(),
                            source=page_tite,
                            # Copy: the appends below must never reach the
                            # list owned by the caller.
                            raw_tags=list(shared_raw_tags),
                        )
                        if form_raw_tag != "":
                            form.raw_tags.append(form_raw_tag)
                        if row_header != "":
                            form.raw_tags.append(row_header)
                        for col_header in col_headers:
                            if (
                                col_index >= col_header.index
                                and col_index
                                < col_header.index + col_header.span
                            ):
                                if col_header.text.endswith("I"):
                                    # Kept as one tag instead of being
                                    # split on whitespace.
                                    form.raw_tags.append(col_header.text)
                                else:
                                    form.raw_tags.extend(
                                        col_header.text.split()
                                    )
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    col_index += 1
def extract_deklinationsseite_numerale_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite Numerale" template.

    The template is expanded and re-parsed, then each top-level table is
    read row by row.  Header cells in a row that also has data cells are
    row headers; header cells in a row without data cells are column
    headers.  Inside a data cell, ``<br>`` elements separate words and
    italic nodes contribute raw tags.  Words are stored through
    `deklinationsseite_numerale_add_form`.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Numerale
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_template.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    # Empty cells are skipped and do not advance col_index.
                    continue
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if row_has_data:
                        row_header = cell_text
                    else:
                        col_span = int(cell.attrs.get("colspan", "1"))
                        if col_index == 0:
                            col_headers.clear()  # new table
                        col_headers.append(
                            SpanHeader(cell_text, col_index, col_span)
                        )
                        col_index += col_span
                else:
                    word_nodes = []
                    # NOTE(review): raw_tags is not reset at <br>, so tags
                    # collected for an earlier word in this cell remain in
                    # the list for later words - confirm this is intended.
                    raw_tags = []
                    for cell_child in cell.children:
                        if (
                            isinstance(cell_child, HTMLNode)
                            and cell_child.tag == "br"
                        ):
                            # <br> ends the current word.
                            word = clean_node(wxr, None, word_nodes)
                            if word != "":
                                deklinationsseite_numerale_add_form(
                                    word_entry,
                                    word,
                                    page_tite,
                                    raw_tags,
                                    col_index,
                                    row_header,
                                    col_headers,
                                )
                            word_nodes.clear()
                        elif (
                            isinstance(cell_child, WikiNode)
                            and cell_child.kind == NodeKind.ITALIC
                        ):
                            raw_tag = clean_node(wxr, None, cell_child).strip(
                                ": "
                            )
                            if raw_tag != "":
                                raw_tags.append(raw_tag)
                        else:
                            word_nodes.append(cell_child)
                    # Word after the last <br> (or the only word).
                    word = clean_node(wxr, None, word_nodes)
                    if word != "":
                        deklinationsseite_numerale_add_form(
                            word_entry,
                            word,
                            page_tite,
                            raw_tags,
                            col_index,
                            row_header,
                            col_headers,
                        )
                    col_index += 1
def deklinationsseite_numerale_add_form(
    word_entry: WordEntry,
    word: str,
    source: str,
    raw_tags: list[str],
    index: int,
    row_header: str,
    col_headers: list[SpanHeader],
) -> None:
    """Build a `Form` for `word` and append it to ``word_entry.forms``.

    The form's raw tags are, in order: the given `raw_tags`, `row_header`
    when it is not empty, and the text of every entry of `col_headers`
    whose column range contains `index`.  The raw tags are translated
    before the form is stored.
    """
    form = Form(
        form=word,
        source=source,
        # Copy: the caller reuses its list for further words of the same
        # cell, so the headers appended below must not end up in it.
        raw_tags=list(raw_tags),
    )
    if row_header != "":
        form.raw_tags.append(row_header)
    form.raw_tags.extend(
        col_header.text
        for col_header in col_headers
        if col_header.index <= index < col_header.index + col_header.span
    )
    translate_raw_tags(form)
    word_entry.forms.append(form)