Coverage for src / wiktextract / extractor / de / flexion.py: 69%
188 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1from dataclasses import dataclass
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Form, WordEntry
15from .tags import GRAMMATICAL_TAGS, translate_raw_tags
def parse_flexion_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract inflected forms for ``word_entry`` from its "Flexion" page.

    The body of the page named ``page_title`` is looked up in the "Flexion"
    namespace and parsed. Every "Deklinationsseite..." or "Deutsch Verb..."
    template found in it is handed to the matching extractor, which appends
    forms to ``word_entry``. Nothing happens when the page body is ``None``.
    """
    # https://de.wiktionary.org/wiki/Hilfe:Flexionsseiten
    auxiliary_verb_tags = ("Hilfsverb haben", "Hilfsverb sein")

    flexion_ns_id = wxr.wtp.NAMESPACE_DATA["Flexion"]["id"]
    page_body = wxr.wtp.get_page_body(page_title, flexion_ns_id)
    if page_body is None:
        return

    root = wxr.wtp.parse(page_body)
    # Raw tags derived from the most recent level-2 heading. The list is
    # reset in place at every heading and is only passed on to the
    # "Deutsch Verb" extractor.
    shared_raw_tags = []
    for node in root.find_child_recursively(
        NodeKind.TEMPLATE | NodeKind.LEVEL2
    ):
        if node.kind == NodeKind.LEVEL2:
            shared_raw_tags.clear()
            heading = clean_node(wxr, None, node.largs)
            heading_words = (part.strip(", ") for part in heading.split(" "))
            # Keep known grammatical tags, except a word that is also the
            # ":<word>" suffix of the page title itself.
            shared_raw_tags.extend(
                word
                for word in heading_words
                if word in GRAMMATICAL_TAGS
                and not page_title.endswith(f":{word}")
            )
            # Multi-word auxiliary-verb tags are matched as substrings of
            # the whole heading rather than word by word.
            shared_raw_tags.extend(
                tag for tag in auxiliary_verb_tags if tag in heading
            )
        elif node.kind == NodeKind.TEMPLATE:
            name = node.template_name
            # The exact "Numerale" name must be tested before the generic
            # "Deklinationsseite" prefix.
            if name == "Deklinationsseite Numerale":
                extract_deklinationsseite_numerale_template(
                    wxr, word_entry, node, page_title
                )
            elif name.startswith("Deklinationsseite"):
                process_deklinationsseite_template(
                    wxr, word_entry, node, page_title
                )
            elif name.startswith("Deutsch Verb"):
                process_deutsch_verb_template(
                    wxr, word_entry, node, page_title, shared_raw_tags
                )
@dataclass
class SpanHeader:
    """A table header together with the range of columns it applies to.

    A column ``c`` is covered by the header when
    ``index <= c < index + span``.
    """

    # Header text as extracted from the header cell.
    text: str
    # Column position at which the header starts.
    index: int
    # Number of columns the header covers, counted from ``index``.
    span: int
def process_deklinationsseite_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from a "Deklinationsseite..." template.

    The template is expanded and re-parsed. The text of each ``<h4>`` node
    becomes a raw tag for the tables that follow it, and every table is
    walked row by row and cell by cell. Each line of a data cell that is
    neither empty nor the "—" placeholder becomes a ``Form`` appended to
    ``word_entry.forms``, with ``page_tite`` recorded as its source.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Adjektiv
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL

    heading = ""
    for child in expanded.find_child(NodeKind.HTML | NodeKind.TABLE):
        if isinstance(child, HTMLNode) and child.tag == "h4":
            heading = clean_node(wxr, None, child)
            continue
        if child.kind != NodeKind.TABLE:
            continue

        span_headers = []
        # True once an "Artikel"/"Wortform" header was seen: data cells at
        # even column positions then hold the article, not a form.
        article_mode = False
        for row in child.find_child(NodeKind.TABLE_ROW):
            column = 0
            row_title = ""
            article = ""
            for cell in row.find_child(cell_kinds):
                text = clean_node(wxr, None, cell)

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if text == "":
                        pass
                    elif text in ("Artikel", "Wortform"):
                        article_mode = True
                    elif "colspan" in cell.attrs:
                        width = int(cell.attrs.get("colspan"))
                        if width == 9:  # new table
                            article_mode = False
                            span_headers.clear()
                        span_headers.append(SpanHeader(text, column, width))
                        column += width
                    else:
                        # A header cell without colspan labels the row.
                        row_title = text
                    continue
                if cell.kind != NodeKind.TABLE_CELL:
                    continue

                if not (article_mode and column % 2 == 0):
                    # Tag order: section heading, row header, then every
                    # spanning column header that covers this column.
                    tags = [tag for tag in (heading, row_title) if tag != ""]
                    tags.extend(
                        header.text
                        for header in span_headers
                        if header.text not in ("", "—")
                        and header.index <= column < header.index + header.span
                    )
                    for line in text.splitlines():
                        form = Form(
                            form=line,
                            source=page_tite,
                            raw_tags=tags,
                            article=article,
                        )
                        if form.form in ("", "—"):
                            continue
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                elif text != "—":
                    # Remember the article for the form cell(s) after it.
                    article = text
                column += int(cell.attrs.get("colspan", "1"))
def process_deutsch_verb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Expand a "Deutsch Verb..." template and process its sections.

    The template is converted back to wikitext, parsed with full expansion,
    and every level node yielded by ``find_child`` on the result is passed
    to ``process_deutsch_verb_section`` along with ``shared_raw_tags``.
    """
    # Vorlage:Deutsch Verb regelmäßig
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    for section in expanded.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, section, page_tite, shared_raw_tags
        )
def process_deutsch_verb_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Process one section of an expanded verb template, then its subsections.

    The section title is appended to a copy of ``shared_raw_tags`` (the
    caller's list is left untouched). That extended list is used for every
    table in this section and is passed down to nested sections, so tags
    accumulate along the heading hierarchy.
    """
    title = clean_node(wxr, None, level_node.largs)
    section_tags = [*shared_raw_tags, title]

    # Tables of this section come first, nested sections afterwards.
    for table in level_node.find_child(NodeKind.TABLE):
        process_deutsch_verb_table(
            wxr, word_entry, table, page_tite, section_tags
        )
    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, subsection, page_tite, section_tags
        )
def process_deutsch_verb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Extract verb forms from one table of an expanded verb template.

    Rows are scanned cell by cell. Header cells, and data cells that look
    like headers (bold content, a lone ``<small>`` child, or the
    ``#f4f4f4`` background), feed the column/row header state. Every
    remaining data cell is split into lines, and each line becomes a
    ``Form`` appended to ``word_entry.forms`` with ``page_tite`` as source.

    Each form's raw tags are, in order: ``shared_raw_tags``, an optional
    "label:" prefix taken from the line, the current row header, and the
    column headers covering the cell's column. ``shared_raw_tags`` itself is
    never modified. Lines that are empty or only the "—" placeholder after
    cleanup are skipped, matching ``process_deklinationsseite_template``.
    """
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    col_headers = []
    for row in table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        col_index = 0
        col_header_index = 0
        # True when every non-empty data cell of the row contains bold
        # text; such a row is treated as a row of column headers.
        is_bold_col_header = all(
            c.contain_node(NodeKind.BOLD)
            for c in row.find_child(NodeKind.TABLE_CELL)
            if clean_node(wxr, None, c) != ""
        )
        cells = list(row.find_child(cell_kinds))
        if len(cells) == 1:
            col_headers.clear()  # new table
        for cell in cells:
            cell_text = clean_node(wxr, None, cell)
            if cell_text in (
                "Flexion der Verbaladjektive",
                "(nichterweiterte) Infinitive",
            ):
                # The rest of a row starting with one of these titles is
                # not processed.
                break
            elif cell.kind == NodeKind.TABLE_HEADER_CELL and cell_text not in (
                "",
                "Person",
            ):
                colspan = int(cell.attrs.get("colspan", "1"))
                col_headers.append(
                    SpanHeader(cell_text, col_header_index, colspan)
                )
                col_header_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if cell_text in (
                    "",
                    "—",
                    "Text",
                    "Person",
                ) or cell_text.startswith("Flexion:"):
                    # Placeholder/label cell: occupies a column, no form.
                    col_index += 1
                elif (
                    cell.contain_node(NodeKind.BOLD)
                    or (
                        len(list(cell.find_html("small"))) > 0
                        and len(list(cell.filter_empty_str_child())) == 1
                    )
                    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
                    or cell.attrs.get("bgcolor", "").lower() == "#f4f4f4"
                ):  # header in cell
                    colspan = int(cell.attrs.get("colspan", "1"))
                    if is_bold_col_header:
                        for bold_node in cell.find_child(NodeKind.BOLD):
                            col_headers.append(
                                SpanHeader(
                                    clean_node(wxr, None, bold_node),
                                    col_header_index,
                                    colspan,
                                )
                            )
                    else:
                        row_header = cell_text
                    col_header_index += colspan
                else:
                    for form_text in cell_text.splitlines():
                        form_text = form_text.strip(", ")
                        form_raw_tag = ""
                        if ":" in form_text:
                            form_raw_tag, form_text = form_text.split(":", 1)
                        form_text = form_text.strip()
                        if form_text in ("", "—"):
                            # Nothing left after cleanup: do not emit an
                            # entry with an empty or placeholder form.
                            continue

                        # Start from a fresh copy so the caller's list is
                        # never mutated, however Form stores its input.
                        raw_tags = list(shared_raw_tags)
                        if form_raw_tag != "":
                            raw_tags.append(form_raw_tag)
                        if row_header != "":
                            raw_tags.append(row_header)
                        for col_header in col_headers:
                            if (
                                col_header.index
                                <= col_index
                                < col_header.index + col_header.span
                            ):
                                if col_header.text.endswith("I"):
                                    # Headers ending in "I" stay one tag
                                    # (presumably e.g. "Konjunktiv I"/"II"
                                    # — TODO confirm).
                                    raw_tags.append(col_header.text)
                                else:
                                    raw_tags.extend(col_header.text.split())

                        form = Form(
                            form=form_text,
                            source=page_tite,
                            raw_tags=raw_tags,
                        )
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    col_index += 1
def extract_deklinationsseite_numerale_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from a "Deklinationsseite Numerale" template.

    The template is expanded and re-parsed, and each table in the result is
    walked row by row. Header cells in a row without data cells are recorded
    as column headers; a header cell in a row with data cells becomes the
    row header. Inside a data cell, ``<br>`` separates words, italic nodes
    contribute raw tags, and each non-empty word is passed to
    ``deklinationsseite_numerale_add_form``.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Numerale
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    # Sentinel appended after a cell's children so the last word is flushed
    # by the same branch that handles <br>.
    end_of_cell = object()

    for table in expanded.find_child(NodeKind.TABLE):
        headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_title = ""
            has_data_cells = row.contain_node(NodeKind.TABLE_CELL)
            column = 0
            for cell in row.find_child(cell_kinds):
                text = clean_node(wxr, None, cell)
                if text == "":
                    # Empty cells are skipped and do not advance the column.
                    continue

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if has_data_cells:
                        row_title = text
                        continue
                    width = int(cell.attrs.get("colspan", "1"))
                    if column == 0:
                        headers.clear()  # new table
                    headers.append(SpanHeader(text, column, width))
                    column += width
                    continue

                # Data cell: labels keep accumulating across <br> within
                # the cell; pending nodes are reset after each word.
                labels = []
                pending = []
                for child in (*cell.children, end_of_cell):
                    is_break = isinstance(child, HTMLNode) and child.tag == "br"
                    if is_break or child is end_of_cell:
                        word = clean_node(wxr, None, pending)
                        if word != "":
                            deklinationsseite_numerale_add_form(
                                word_entry,
                                word,
                                page_tite,
                                labels,
                                column,
                                row_title,
                                headers,
                            )
                        pending.clear()
                    elif (
                        isinstance(child, WikiNode)
                        and child.kind == NodeKind.ITALIC
                    ):
                        label = clean_node(wxr, None, child).strip(": ")
                        if label != "":
                            labels.append(label)
                    else:
                        pending.append(child)
                column += 1
def deklinationsseite_numerale_add_form(
    word_entry: WordEntry,
    word: str,
    source: str,
    raw_tags: list[str],
    index: int,
    row_header: str,
    col_headers: list[SpanHeader],
) -> None:
    """Create a ``Form`` for ``word`` and append it to ``word_entry.forms``.

    The form starts with ``raw_tags``; a non-empty ``row_header`` and the
    text of every column header whose span covers column ``index`` are then
    added, in that order, before ``translate_raw_tags`` is applied.
    """
    new_form = Form(form=word, source=source, raw_tags=raw_tags)
    # NOTE(review): tags are appended to new_form.raw_tags. Whether that is
    # the same list object as the caller's `raw_tags` depends on how Form
    # stores its input — confirm it keeps its own copy.
    if row_header != "":
        new_form.raw_tags.append(row_header)
    new_form.raw_tags.extend(
        header.text
        for header in col_headers
        if header.index <= index < header.index + header.span
    )
    translate_raw_tags(new_form)
    word_entry.forms.append(new_form)