Coverage for src/wiktextract/extractor/de/inflection.py: 92%
127 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import NodeKind, TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .flexion import parse_flexion_page
9from .models import Form, WordEntry
10from .tags import translate_raw_tags
def extract_inf_table_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Route an inflection overview template to the matching table parser.

    The parser is selected by the suffix of ``template_node.template_name``.
    Templates whose name ends with none of the known suffixes are ignored.
    """
    # Checked in this order; the first matching suffix wins.
    parsers_by_suffix = (
        ("Substantiv Übersicht", process_noun_table),
        ("Adjektiv Übersicht", process_adj_table),
        ("Verb Übersicht", process_verb_table),
    )
    name = template_node.template_name
    for suffix, parser in parsers_by_suffix:
        if name.endswith(suffix):
            parser(wxr, word_entry, template_node)
            return
@dataclass
class RowspanHeader:
    """A table row header plus the number of rows it still applies to."""

    # Header text; appended to a form's raw tags while the header is active.
    text: str
    # Header position; the construction sites visible in this file pass 0.
    index: int
    # Number of remaining table rows covered by this header.
    span: int
def process_verb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Extract verb forms from the first table of an expanded verb template.

    The template is expanded and re-parsed, then the first table is walked
    row by row. Each non-placeholder line of a data cell becomes a ``Form``
    appended to ``word_entry.forms``, tagged with the column header at the
    cell's index (when one exists) and with every active row header.

    State carried while walking:

    * The first non-empty header cell of a row is a row header; it stays
      active for as many rows as its ``rowspan`` attribute says.
    * A "Person"/"Wortform" header marks following rows as having a leading
      person cell, whose comma-separated entries are each prefixed to the
      forms of that row. A leading "Singular"/"Plural" cell is instead
      treated as a one-row row header.
    * Any other non-first header cell is a column header.
    * A cell starting with "All other forms:" is not a form: every link
      inside it is passed to ``parse_flexion_page``.

    Nothing is done when the expansion contains no table.
    """
    # Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_node = next(iter(expanded_template.find_child(NodeKind.TABLE)), None)
    if table_node is None:
        return

    col_headers = []
    has_person = False
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        header_col_index = 0
        person = ""
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    parse_flexion_page(
                        wxr, word_entry, clean_node(wxr, None, link_node)
                    )
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    # Empty header cells do not count as a header column.
                    continue
                if header_col_index == 0:
                    # The attribute comes from wiki markup and may be
                    # malformed; treat an unparsable value as a single row
                    # instead of aborting the whole extraction.
                    try:
                        rowspan = int(table_cell.attrs.get("rowspan", "1"))
                    except ValueError:
                        rowspan = 1
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    person = ""
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        person = cell_text
                else:
                    for cell_line in cell_text.splitlines():
                        cell_line = cell_line.strip()
                        # Skip blanks and the placeholders that the noun and
                        # adjective parsers also ignore; otherwise entries
                        # such as "ich —" would be stored as forms.
                        if cell_line in ("", "—", "?"):
                            continue
                        for p in person.split(","):
                            p = p.strip()
                            form_text = cell_line
                            if p != "":
                                form_text = p + " " + cell_line
                            form = Form(form=form_text)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # Age the row headers: keep only those that still span later rows.
        new_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                new_row_headers.append(row_header)
        row_headers = new_row_headers
def process_noun_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Extract noun forms from the first table of an expanded noun template.

    A row without any data cell is a header row: its header cells become
    column headers, with a trailing number (and the whitespace before it)
    removed. In other rows a header cell is the row header, and each
    non-placeholder line of a data cell becomes a ``Form`` appended to
    ``word_entry.forms``, tagged with the row header and with the column
    header at the cell's index when one exists.

    After the table is processed the whole expansion is passed through
    ``clean_node`` together with ``word_entry`` (category links). Nothing is
    done when the expansion contains no table.
    """
    # Vorlage:Deutsch Substantiv Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_node = next(iter(expanded_template.find_child(NodeKind.TABLE)), None)
    if table_node is None:
        return

    column_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        for col_index, table_cell in enumerate(
            table_row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if is_header_row:
                    column_headers.append(re.sub(r"\s*\d+$", "", cell_text))
                else:
                    row_header = cell_text
                continue

            for form_text in cell_text.splitlines():
                # Strip first so padded placeholders and whitespace-only
                # lines are recognised, and stored forms carry no stray
                # whitespace.
                form_text = form_text.strip()
                if form_text in ("—", "", "?"):
                    continue
                form = Form(form=form_text)
                if len(row_header) > 0:
                    form.raw_tags.append(row_header)
                if col_index < len(column_headers):
                    form.raw_tags.append(column_headers[col_index])
                translate_raw_tags(form)
                word_entry.forms.append(form)

    clean_node(wxr, word_entry, expanded_template)  # category links
def process_adj_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Extract adjective forms from the first table of an expanded template.

    Header cells are collected as column headers. Each non-placeholder line
    of a data cell becomes a ``Form`` appended to ``word_entry.forms``,
    tagged with the column header at the cell's index when one exists. A
    cell starting with "All other forms:" is not a form: every link inside
    it is passed to ``parse_flexion_page``.

    Nothing is done when the expansion contains no table.
    """
    # Vorlage:Deutsch Adjektiv Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_node = next(iter(expanded_template.find_child(NodeKind.TABLE)), None)
    if table_node is None:
        return

    column_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        for col_index, table_cell in enumerate(
            table_row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
        ):
            cell_text = clean_node(wxr, None, table_cell)
            # because {{int:}} magic word is not implemented
            # template "Textbaustein-Intl" expands to English words
            if cell_text.startswith("All other forms:"):
                # Search recursively, as the verb table parser does, so
                # links wrapped in other nodes are not missed.
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    parse_flexion_page(
                        wxr, word_entry, clean_node(wxr, None, link_node)
                    )
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                column_headers.append(cell_text)
            else:
                for form_text in cell_text.splitlines():
                    # Strip first so padded placeholders and
                    # whitespace-only lines are recognised.
                    form_text = form_text.strip()
                    if form_text in ("—", "", "?"):
                        continue
                    form = Form(form=form_text)
                    if col_index < len(column_headers):
                        form.raw_tags.append(column_headers[col_index])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)