Coverage for src / wiktextract / extractor / de / inflection.py: 87%

227 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-19 11:25 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .flexion import parse_flexion_page 

9from .models import Form, WordEntry 

10from .tags import translate_raw_tags 

11 

12# Kategorie:Wiktionary:Flexionstabelle (Deutsch) 

13 

14 

def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection-table template to the extractor for its kind.

    The decision is based solely on ``t_node.template_name``; the checks
    run in the order below and the first match wins.  A template name that
    matches none of them is silently ignored.

    Args:
        wxr: Extraction context, forwarded unchanged to the extractor.
        word_entry: Entry that the selected extractor adds forms to.
        t_node: The table template node found on the page.
    """
    name = t_node.template_name

    # Templates whose tables are handled by the noun-table extractor.
    noun_like_suffixes = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Vorname Übersicht f",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )
    if (
        "Substantiv Übersicht" in name
        or name.endswith(noun_like_suffixes)
        or re.search(r" Personalpronomen \d$", name)
    ):
        extract_noun_table_template(wxr, word_entry, t_node)
        return

    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return

    # "Kardinalzahl 2-12" is handled by the verb-table extractor as well.
    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return

    if name == "Deutsch Possessivpronomen":
        extract_de_pronoun_table(wxr, word_entry, t_node)

48 

49 

@dataclass
class RowspanHeader:
    """A table header together with the range of rows or columns it covers.

    Despite the name, this module uses it both for row headers (``span``
    taken from ``rowspan``) and for column headers (``span`` taken from
    ``colspan``).
    """

    # Cleaned header text; later appended to ``Form.raw_tags``.
    text: str
    # Column index at which the header starts (row headers pass 0).
    index: int
    # Number of rows or columns the header covers.
    span: int

55 

56 

def _verb_table_rowspan(table_cell: WikiNode) -> int:
    """Return the cell's ``rowspan`` attribute as an int, defaulting to 1.

    A missing attribute or one that ``int()`` cannot parse yields 1, so a
    malformed attribute in the expanded wikitext cannot abort extraction.
    """
    try:
        return int(table_cell.attrs.get("rowspan", "1"))
    except ValueError:
        return 1


def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the first table of an expanded verb template.

    The template node is converted back to wikitext and re-parsed with all
    templates expanded.  Only the first table child is processed; nothing
    happens when there is none.  Every data cell is split on newlines and
    commas, and each non-empty piece becomes a ``Form`` appended to
    ``word_entry.forms``.  A form is tagged with the column header at its
    column index (when one exists) and with every row header that is still
    active.  Text or links starting with "Flexion:" are handed to
    ``parse_flexion_page`` instead of being stored as forms.

    Args:
        wxr: Extraction context used for parsing and node cleaning.
        word_entry: Entry that receives the extracted forms.
        t_node: The verb overview template node.
    """
    # Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_node = next(
        iter(expanded_template.find_child(NodeKind.TABLE)), None
    )
    if table_node is None:
        return

    col_headers: list[str] = []
    has_person = False
    # Row headers persist across rows until their span is used up.
    row_headers: list[RowspanHeader] = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        header_col_index = 0
        pronouns: list[str] = []
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                # Cell that links to the full inflection page(s).
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                if header_col_index == 0:
                    # First non-empty header cell of a row is a row header.
                    row_headers.append(
                        RowspanHeader(
                            cell_text, 0, _verb_table_rowspan(table_cell)
                        )
                    )
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    pronouns.clear()
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                cell_style = table_cell.attrs.get("style", "").lower()
                if "background-color: #f4f4f4" in cell_style:
                    # Template:Englisch Verb Übersicht
                    # A data cell with this background acts as a row
                    # header and does not advance the column index.
                    row_headers.append(
                        RowspanHeader(
                            cell_text, 0, _verb_table_rowspan(table_cell)
                        )
                    )
                    continue
                if has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        # Applies to the current row only (span of 1).
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        pronouns = [
                            pronoun
                            for pronoun in map(
                                str.strip, cell_text.split(",")
                            )
                            if pronoun
                        ]
                else:
                    for cell_line in cell_text.splitlines():
                        for form_str in map(str.strip, cell_line.split(",")):
                            if form_str in ("", "—"):
                                continue
                            if form_str.startswith("Flexion:"):
                                parse_flexion_page(wxr, word_entry, form_str)
                                continue
                            form = Form(form=form_str, pronouns=pronouns)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # End of row: drop row headers whose span is exhausted and shorten
        # the remaining ones by one row.
        still_active: list[RowspanHeader] = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                still_active.append(row_header)
        row_headers = still_active

138 

139 

def extract_noun_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Expand a noun-style overview template and extract all its tables.

    Every table child of the expanded template is passed to
    ``process_noun_table`` together with the template name.  Level-4
    sections of the expansion whose title starts with "Anmerkung" are
    passed to ``extract_note_section``.
    """
    # Vorlage:Deutsch Substantiv Übersicht
    # Imported here rather than at module level; presumably this avoids a
    # circular import with ``.page`` — confirm before moving it.
    from .page import extract_note_section

    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)
    # The return value is discarded; presumably this call is made for its
    # side effects on ``word_entry`` — confirm against ``clean_node``.
    clean_node(wxr, word_entry, root)

    template_name = t_node.template_name
    for table_node in root.find_child(NodeKind.TABLE):
        process_noun_table(wxr, word_entry, table_node, template_name)

    # Vorlage:Deutsch Nachname Übersicht
    for section_node in root.find_child(NodeKind.LEVEL4):
        section_title = clean_node(wxr, None, section_node.largs)
        if not section_title.startswith("Anmerkung"):
            continue
        extract_note_section(wxr, word_entry, section_node)

158 

159 

def process_noun_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    template_name: str,
) -> None:
    """Extract forms from one table of a noun-style overview template.

    Rows without any data cell are header rows: their header cells become
    column headers, each remembering its start column and ``colspan``.  In
    other rows a header cell is the row header.  Each line of a data cell
    becomes a ``Form`` tagged with the current table header, the row
    header, and every column header whose span covers the cell's column.
    A non-empty data cell in a row with no header cell starts a new
    sub-table: its text becomes the table header and the column headers
    are reset.

    For the "Deutsch Substantiv Übersicht" and "Deutsch Vorname Übersicht
    m/f" templates, a leading article is split off each form by
    ``separate_de_article``.  "Flexion:" links collected along the way are
    parsed after the forms have been added to ``word_entry``.

    Args:
        wxr: Extraction context used for node cleaning.
        word_entry: Entry that receives the extracted forms.
        table: The table node to process.
        template_name: Name of the template the table came from.
    """
    column_headers: list[RowspanHeader] = []
    table_header = ""
    forms: list[Form] = []
    flexion_pages: list[str] = []
    for table_row in table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ("", "Kasus", "Utrum", "m", "f", "m, f")
                    and col_index == 0
                ):
                    # Leading corner/label cell: not a column header and
                    # not counted as a column.
                    continue
                elif is_header_row:
                    try:
                        colspan = int(table_cell.attrs.get("colspan", "1"))
                    except ValueError:
                        # A malformed attribute must not abort extraction.
                        colspan = 1
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                # Drop a trailing number from the header.
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                # NOTE(review): an empty data cell does not advance
                # ``col_index``; confirm this is intended.
                continue
            elif not row_has_header:
                # Vorlage:Deutsch adjektivisch Übersicht
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        flexion_pages.append(link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ("—", "–", "-", "", "?"):
                        continue
                    form = Form(form=form_text)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if len(row_header) > 0:
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if (
                            col_header.text not in ("", "—")
                            and col_index >= col_header.index
                            and col_index < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    forms.append(form)
                col_index += 1

    if template_name in (
        "Deutsch Substantiv Übersicht",
        "Deutsch Vorname Übersicht m",
        "Deutsch Vorname Übersicht f",
    ):
        forms = separate_de_article(wxr, forms)
    word_entry.forms.extend(forms)
    for flexion_page in flexion_pages:
        parse_flexion_page(wxr, word_entry, flexion_page)

240 

241 

def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the first table of an adjective/adverb template.

    Header cells are collected, in order, into one list across all rows.
    Each line of a data cell becomes a ``Form`` tagged with the header
    stored at the cell's position within its row, when such a header
    exists.  Links inside an "All other forms:" cell are passed to
    ``parse_flexion_page``.
    """
    # Vorlage:Deutsch Adjektiv Übersicht
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)
    first_table = next(iter(root.find_child(NodeKind.TABLE)), None)
    if first_table is None:
        return

    headers = []
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for row in first_table.find_child(NodeKind.TABLE_ROW):
        for position, cell in enumerate(row.find_child(cell_kinds)):
            text = clean_node(wxr, None, cell)
            # because {{int:}} magic word is not implemented
            # template "Textbaustein-Intl" expands to English words
            if text.startswith("All other forms:"):
                for link in cell.find_child(NodeKind.LINK):
                    page_title = clean_node(wxr, None, link)
                    parse_flexion_page(wxr, word_entry, page_title)
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                headers.append(text)
                continue
            for line in text.splitlines():
                # Placeholders for a missing form are skipped.
                if line in ("—", "", "?"):
                    continue
                form = Form(form=line)
                if position < len(headers):
                    form.raw_tags.append(headers[position])
                translate_raw_tags(form)
                word_entry.forms.append(form)

279 

280 

def extract_de_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the first table of the possessive pronoun template.

    Header cells are interpreted by the row they are in: in a row with
    data cells a header is the row header; a header that is the only
    header cell of its row starts a new sub-table; any other header is a
    column header with its start column and ``colspan``.  Data cells
    alternate between an article (even column index) and a form (odd
    column index); each form is stored with the article preceding it.
    """
    # Vorlage:Deutsch Possessivpronomen
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)
    first_table = next(iter(root.find_child(NodeKind.TABLE)), None)
    if first_table is None:
        return

    col_headers = []
    table_header = ""
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for row in first_table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        col_index = 0
        article = ""
        for cell in row.find_child(cell_kinds):
            cell_text = clean_node(wxr, None, cell)

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    pass
                elif row_has_data:
                    row_header = cell_text
                elif len(list(row.find_child(NodeKind.TABLE_HEADER_CELL))) == 1:
                    table_header = cell_text
                    col_headers.clear()  # new table
                    article = ""
                else:
                    colspan_str = cell.attrs.get("colspan", "1")
                    # Anything but a plain digit string counts as span 1.
                    if re.fullmatch(r"\d+", colspan_str):
                        colspan = int(colspan_str)
                    else:
                        colspan = 1
                    if cell_text != "—":
                        col_headers.append(
                            RowspanHeader(cell_text, col_index, colspan)
                        )
                    col_index += colspan
                continue

            if cell.kind != NodeKind.TABLE_CELL:
                continue

            if col_index % 2 == 0:
                # Article column; a dash means "no article".
                if cell_text != "—":
                    article = cell_text
            else:
                form = Form(form=cell_text, article=article)
                if table_header != "":
                    form.raw_tags.append(table_header)
                if row_header != "":
                    form.raw_tags.append(row_header)
                for header in col_headers:
                    covers_column = (
                        header.index <= col_index < header.index + header.span
                    )
                    if covers_column and header.text != "Wortform":
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                if form.form not in ["", "—"]:
                    word_entry.forms.append(form)
                # The article belongs only to the form right after it.
                article = ""
            col_index += 1

344 

345 

def separate_de_article(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Split a leading German definite article off each form.

    A form starting with der/die/das/den/dem/des (optionally wrapped in
    parentheses) followed by whitespace has the article moved to
    ``form.article`` and removed from ``form.form``.  The forms are
    modified in place.  Forms whose text is then empty or a lone dash are
    left out of the returned list.  ``wxr`` is not used.
    """
    # https://en.wikipedia.org/wiki/German_articles
    # https://de.wiktionary.org/wiki/Vorlage:Deutsch_Substantiv_Übersicht
    # https://de.wiktionary.org/wiki/Vorlage:Deutsch_Vorname_Übersicht_m
    # * May contain parens around the article
    article_prefix = re.compile(r"\(?(der|die|das|den|dem|des)\)?\s+")
    kept = []
    for form in forms:
        found = article_prefix.match(form.form)
        if found is not None:
            form.form = form.form[found.end() :]
            form.article = found.group(1)
        if form.form in ("", "—"):
            continue
        kept.append(form)
    return kept