Coverage for src/wiktextract/extractor/de/flexion.py: 94%
118 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from dataclasses import dataclass
3from wikitextprocessor import NodeKind
4from wikitextprocessor.parser import HTMLNode, TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
def parse_flexion_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Add forms found on the "Flexion" namespace page to ``word_entry``.

    The body of ``page_title`` is looked up in the namespace whose id is
    stored under ``NAMESPACE_DATA["Flexion"]``.  Every template found in
    the parsed page whose name starts with "Deklinationsseite" or
    "Deutsch Verb" is handed to the matching processor; other templates
    are ignored.  Nothing happens when the page body is ``None``.

    Args:
        wxr: Extraction context giving access to the wikitext processor.
        word_entry: Entry passed on to the template processors.
        page_title: Title of the page to look up; also forwarded to the
            processors.
    """
    # https://de.wiktionary.org/wiki/Hilfe:Flexionsseiten
    namespace_id = wxr.wtp.NAMESPACE_DATA["Flexion"]["id"]
    page_body = wxr.wtp.get_page_body(page_title, namespace_id)
    if page_body is None:
        return

    root = wxr.wtp.parse(page_body)
    for template in root.find_child_recursively(NodeKind.TEMPLATE):
        name = template.template_name
        if name.startswith("Deklinationsseite"):
            handler = process_deklinationsseite_template
        elif name.startswith("Deutsch Verb"):
            handler = process_deutsch_verb_template
        else:
            continue
        handler(wxr, word_entry, template, page_title)
@dataclass
class SpanHeader:
    """A table header together with the range of columns it applies to."""

    # Cleaned text of the header cell.
    text: str
    # Column index at which the header starts.
    index: int
    # Number of columns covered, taken from the cell's colspan.
    span: int
def process_deklinationsseite_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite" template.

    The template is expanded and its direct HTML and table children are
    walked in order.  The text of an ``<h4>`` element is remembered and
    attached as a raw tag to every form of the tables that follow it.
    Inside a table, header cells provide either spanning column headers
    (cells with a ``colspan`` attribute) or the row header; data cells
    provide the forms, optionally preceded by an article cell.

    Args:
        wxr: Extraction context giving access to the wikitext processor.
        word_entry: Entry whose ``forms`` list receives the new forms.
        template_node: The template node to expand.
        page_tite: Stored as ``source`` on every created form.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Adjektiv
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    placeholders = ("", "—")
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    section_title = ""
    for child in expanded.find_child(NodeKind.HTML | NodeKind.TABLE):
        if isinstance(child, HTMLNode) and child.tag == "h4":
            section_title = clean_node(wxr, None, child)
            continue
        if child.kind != NodeKind.TABLE:
            continue

        span_headers = []
        expect_article = False
        for table_row in child.find_child(NodeKind.TABLE_ROW):
            column = 0
            row_title = ""
            article_text = ""
            for cell in table_row.find_child(cell_kinds):
                text = clean_node(wxr, None, cell)

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if text == "":
                        pass
                    elif text in ("Artikel", "Wortform"):
                        # From here on, data cells alternate between
                        # an article cell and a word form cell.
                        expect_article = True
                    elif "colspan" in cell.attrs:
                        width = int(cell.attrs.get("colspan"))
                        if width == 9:  # new table
                            expect_article = False
                            span_headers.clear()
                        span_headers.append(SpanHeader(text, column, width))
                        column += width
                    else:
                        row_title = text
                    continue

                if cell.kind != NodeKind.TABLE_CELL:
                    continue

                if expect_article and column % 2 == 0:
                    # Even columns hold the article of the next form.
                    article_text = text
                else:
                    prefix = ""
                    if article_text not in placeholders:
                        prefix = article_text + " "
                    form = Form(form=prefix + text, source=page_tite)
                    if section_title != "":
                        form.raw_tags.append(section_title)
                    if row_title != "":
                        form.raw_tags.append(row_title)
                    form.raw_tags.extend(
                        header.text
                        for header in span_headers
                        if header.text not in placeholders
                        and header.index
                        <= column
                        < header.index + header.span
                    )
                    if form.form not in placeholders:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                # Every data cell advances the column, article or not.
                column += int(cell.attrs.get("colspan", "1"))
def process_deutsch_verb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract verb forms from an expanded "Deutsch Verb" template.

    Every table found anywhere in the expanded template is scanned row
    by row.  Header cells and data cells that look like headers (bold
    content, a lone ``<small>`` child, or a ``#f4f4f4`` background)
    supply column or row headers; all remaining non-placeholder data
    cells supply one form per text line.

    Args:
        wxr: Extraction context giving access to the wikitext processor.
        word_entry: Entry whose ``forms`` list receives the new forms.
        template_node: The template node to expand.
        page_tite: Stored as ``source`` on every created form.
    """
    # Vorlage:Deutsch Verb regelmäßig
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded.find_child_recursively(NodeKind.TABLE):
        span_headers = []
        for table_row in table_node.find_child(NodeKind.TABLE_ROW):
            row_title = ""
            form_column = 0
            header_column = 0

            # True when every non-empty data cell of the row contains
            # bold text (also true for a row without such cells).
            bold_cells_are_col_headers = True
            for data_cell in table_row.find_child(NodeKind.TABLE_CELL):
                if clean_node(
                    wxr, None, data_cell
                ) != "" and not data_cell.contain_node(NodeKind.BOLD):
                    bold_cells_are_col_headers = False
                    break

            cell_count = sum(1 for _ in table_row.find_child(cell_kinds))
            if cell_count == 1:
                span_headers.clear()  # new table

            for cell in table_row.find_child(cell_kinds):
                text = clean_node(wxr, None, cell)
                if text in (
                    "Flexion der Verbaladjektive",
                    "(nichterweiterte) Infinitive",
                ):
                    # The rest of this row is not processed.
                    break

                if (
                    cell.kind == NodeKind.TABLE_HEADER_CELL
                    and text not in ("", "Person")
                ):
                    width = int(cell.attrs.get("colspan", "1"))
                    span_headers.append(
                        SpanHeader(text, header_column, width)
                    )
                    header_column += width
                    continue

                if cell.kind != NodeKind.TABLE_CELL:
                    continue

                if text in ("", "—", "Text", "Person") or text.startswith(
                    "Flexion:"
                ):
                    form_column += 1
                    continue

                looks_like_header = (
                    cell.contain_node(NodeKind.BOLD)
                    or (
                        bool(list(cell.find_html("small")))
                        and len(list(cell.filter_empty_str_child())) == 1
                    )
                    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
                    or cell.attrs.get("bgcolor", "").lower() == "#f4f4f4"
                )
                if looks_like_header:  # header in cell
                    width = int(cell.attrs.get("colspan", "1"))
                    if bold_cells_are_col_headers:
                        span_headers.extend(
                            SpanHeader(
                                clean_node(wxr, None, bold_node),
                                header_column,
                                width,
                            )
                            for bold_node in cell.find_child(NodeKind.BOLD)
                        )
                    else:
                        row_title = text
                    header_column += width
                    continue

                # Ordinary data cell: each text line is one form.
                for line in text.splitlines():
                    line = line.strip(", ")
                    label = ""
                    head, colon, tail = line.partition(":")
                    if colon:
                        label, line = head, tail
                    form = Form(form=line.strip(), source=page_tite)
                    if label != "":
                        form.raw_tags.append(label)
                    if row_title != "":
                        form.raw_tags.append(row_title)
                    for header in span_headers:
                        covers_column = (
                            header.index
                            <= form_column
                            < header.index + header.span
                        )
                        if not covers_column:
                            continue
                        if header.text.endswith("I"):
                            # Kept whole instead of being split on
                            # whitespace.
                            form.raw_tags.append(header.text)
                        else:
                            form.raw_tags.extend(header.text.split())
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                form_column += 1