Coverage for src / wiktextract / extractor / de / inflection.py: 87%

227 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2025-12-29 01:50 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .flexion import parse_flexion_page 

9from .models import Form, WordEntry 

10from .tags import translate_raw_tags 

11 

12# Kategorie:Wiktionary:Flexionstabelle (Deutsch) 

13 

14 

def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection-table template to the extractor for its kind.

    The decision is based solely on ``t_node.template_name``.  The checks
    run in a fixed order (noun-like, adjective/adverb, verb, possessive
    pronoun) and the first match wins; a template whose name matches none
    of them is silently ignored.
    """
    name = t_node.template_name

    # Tables that share the noun layout: nouns, names, pronouns, toponyms.
    noun_like_suffixes = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )
    is_noun_like = (
        "Substantiv Übersicht" in name
        or name.endswith(noun_like_suffixes)
        or re.search(r" Personalpronomen \d$", name) is not None
    )
    if is_noun_like:
        extract_noun_table_template(wxr, word_entry, t_node)
        return

    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return

    # "Kardinalzahl 2-12" is handled by the verb-table extractor as well.
    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return

    if name == "Deutsch Possessivpronomen":
        extract_de_pronoun_table(wxr, word_entry, t_node)

47 

48 

@dataclass
class RowspanHeader:
    """A table header cell plus the position and extent it applies to.

    Despite the name, the table parsers in this module use it both for row
    headers (``span`` is then a ``rowspan``) and for column headers
    (``span`` is then a ``colspan``).
    """

    # Cleaned text of the header cell.
    text: str
    # Column index where the header starts; row headers are stored with 0.
    index: int
    # Number of rows (row header) or columns (column header) covered.
    span: int

54 

55 

def _parse_rowspan(table_cell: WikiNode) -> int:
    """Return ``table_cell``'s ``rowspan`` attribute as an int.

    Falls back to 1 when the attribute is missing or holds a value that
    ``int()`` rejects, so a malformed attribute in the expanded wikitext
    cannot abort the extraction of the whole table.
    """
    try:
        return int(table_cell.attrs.get("rowspan", "1"))
    except ValueError:
        return 1


def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract verb forms from an expanded conjugation-table template.

    The template is expanded and only its first table is read.  Each form
    found is appended to ``word_entry.forms``, tagged with the column header
    at its column index (when there is one) and with every row header that
    is still active.  Links and cell entries starting with ``Flexion:`` are
    passed to ``parse_flexion_page``.
    """
    # Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    col_headers = []
    has_person = False
    # Row headers stay active for as many rows as their rowspan says.
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        header_col_index = 0
        pronouns = []
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                # The label is matched in English; only links to a
                # "Flexion:" page are followed.
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif header_col_index == 0:
                    # First non-empty header cell of the row: a row header.
                    rowspan = _parse_rowspan(table_cell)
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    pronouns.clear()
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if (
                    "background-color: #f4f4f4"
                    in table_cell.attrs.get("style", "").lower()
                ):
                    # Template:Englisch Verb Übersicht
                    # A data cell with this background acts as a row
                    # header and does not advance the column index.
                    rowspan = _parse_rowspan(table_cell)
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                    continue
                elif has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        # Comma-separated pronouns for the forms that
                        # follow in this row.
                        pronouns = list(
                            filter(None, map(str.strip, cell_text.split(",")))
                        )
                else:
                    for cell_line in cell_text.splitlines():
                        for form_str in map(str.strip, cell_line.split(",")):
                            if form_str in ["", "—"]:
                                continue
                            elif form_str.startswith("Flexion:"):
                                parse_flexion_page(wxr, word_entry, form_str)
                                continue
                            form = Form(form=form_str, pronouns=pronouns)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # Age the row headers: keep only those that span further rows.
        new_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                new_row_headers.append(row_header)
        row_headers = new_row_headers

137 

138 

def extract_noun_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Expand a noun-style table template and extract all of its tables.

    Every table in the expanded template is passed to
    ``process_noun_table``.  Level-4 sections whose title starts with
    "Anmerkung" are passed to ``extract_note_section``.
    """
    # Vorlage:Deutsch Substantiv Übersicht
    # Imported inside the function rather than at module level
    # (presumably to avoid a circular import with ``.page`` — confirm).
    from .page import extract_note_section

    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    # The return value is discarded; the call is made with ``word_entry``,
    # presumably for its side effects on the entry — confirm in clean_node.
    clean_node(wxr, word_entry, expanded)
    for table_node in expanded.find_child(NodeKind.TABLE):
        process_noun_table(wxr, word_entry, table_node, t_node.template_name)

    # Vorlage:Deutsch Nachname Übersicht
    for heading in expanded.find_child(NodeKind.LEVEL4):
        heading_text = clean_node(wxr, None, heading.largs)
        if not heading_text.startswith("Anmerkung"):
            continue
        extract_note_section(wxr, word_entry, heading)

157 

158 

def process_noun_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    template_name: str,
) -> None:
    """Extract forms from one noun-style declension table.

    Forms are tagged with the current table header (if any), the row
    header of their row and every column header whose span covers their
    column.  For the two templates whose cells carry a leading article,
    the article is split off with ``separate_de_article``.  The resulting
    forms are appended to ``word_entry.forms``; linked ``Flexion:`` pages
    are parsed afterwards.
    """
    column_headers = []
    table_header = ""
    forms = []
    flexion_pages = []
    for table_row in table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ["", "Kasus", "Utrum", "m", "f", "m, f"]
                    and col_index == 0
                ):
                    # Corner/label cell in the first column: ignored and
                    # not counted as a column.
                    continue
                elif is_header_row:
                    try:
                        colspan = int(table_cell.attrs.get("colspan", "1"))
                    except ValueError:
                        # Malformed attribute: treat the header as one
                        # column wide instead of aborting the extraction.
                        colspan = 1
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                # Drop a trailing number from the header.
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                continue
            elif not row_has_header:
                # Vorlage:Deutsch adjektivisch Übersicht
                # A data cell in a row without header cells starts a new
                # sub-table: it becomes the table header and the column
                # headers collected so far are discarded.
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        flexion_pages.append(link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ["—", "–", "-", "", "?"]:
                        continue
                    form = Form(form=form_text)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if len(row_header) > 0:
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if (
                            col_header.text not in ("", "—")
                            and col_index >= col_header.index
                            and col_index < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    forms.append(form)
                col_index += 1

    if template_name in (
        "Deutsch Substantiv Übersicht",
        "Deutsch Vorname Übersicht m",
    ):
        forms = separate_de_article(wxr, forms)
    word_entry.forms.extend(forms)
    for flexion_page in flexion_pages:
        parse_flexion_page(wxr, word_entry, flexion_page)

238 

239 

def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract adjective/adverb forms from an expanded overview template.

    Only the first table of the expanded template is read.  Header cells
    are collected in order as column headers; each line of a data cell
    becomes a form tagged with the header at the cell's position in its
    row, and is appended to ``word_entry.forms``.
    """
    # Vorlage:Deutsch Adjektiv Übersicht
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    tables = list(expanded.find_child(NodeKind.TABLE))
    if not tables:
        return

    headers = []
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        for position, cell in enumerate(row.find_child(cell_kinds)):
            text = clean_node(wxr, None, cell)

            # because {{int:}} magic word is not implemented
            # template "Textbaustein-Intl" expands to English words
            if text.startswith("All other forms:"):
                for link in cell.find_child(NodeKind.LINK):
                    page_title = clean_node(wxr, None, link)
                    parse_flexion_page(wxr, word_entry, page_title)
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                headers.append(text)
                continue

            for line in text.splitlines():
                # Placeholder lines mean "no such form".
                if line in ("—", "", "?"):
                    continue
                form = Form(form=line)
                if position < len(headers):
                    form.raw_tags.append(headers[position])
                translate_raw_tags(form)
                word_entry.forms.append(form)

277 

278 

def extract_de_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the expanded possessive-pronoun table template.

    Only the first table of the expanded template is read.  Data cells are
    consumed in pairs: the cell at an even column index supplies the
    article, the following cell the form.  Each form is tagged with the
    current table header, its row header and every column header (other
    than "Wortform") whose span covers its column, then appended to
    ``word_entry.forms``.
    """
    # Vorlage:Deutsch Possessivpronomen
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    col_headers = []
    table_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        # A header-only row with exactly one header cell titles a new
        # sub-table.  Computed once per row instead of once per header
        # cell, and only for rows where the answer is actually consulted.
        is_title_row = (
            not row_has_data
            and len(list(row.find_child(NodeKind.TABLE_HEADER_CELL))) == 1
        )
        col_index = 0
        article = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif row_has_data:
                    row_header = cell_text
                elif is_title_row:
                    table_header = cell_text
                    col_headers.clear()  # new table
                    article = ""
                else:
                    # Column header; a non-numeric colspan counts as 1.
                    colspan = 1
                    colspan_str = cell.attrs.get("colspan", "1")
                    if re.fullmatch(r"\d+", colspan_str):
                        colspan = int(colspan_str)
                    if cell_text != "—":
                        col_headers.append(
                            RowspanHeader(cell_text, col_index, colspan)
                        )
                    col_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if col_index % 2 == 0:
                    # Even column: article for the form in the next cell.
                    if cell_text != "—":
                        article = cell_text
                else:
                    form = Form(form=cell_text, article=article)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for header in col_headers:
                        if (
                            col_index >= header.index
                            and col_index < header.index + header.span
                            and header.text != "Wortform"
                        ):
                            form.raw_tags.append(header.text)
                    translate_raw_tags(form)
                    if form.form not in ["", "—"]:
                        word_entry.forms.append(form)
                    # The article applies to a single form only.
                    article = ""
                col_index += 1

342 

343 

def separate_de_article(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Split a leading German definite article off each form.

    A form starting with one of der/die/das/den/dem/des (optionally wrapped
    in parentheses) followed by whitespace has that article moved to its
    ``article`` attribute.  The form objects are modified in place.  Forms
    that are empty or just a dash after the split are dropped.

    References:
      https://en.wikipedia.org/wiki/German_articles
      https://de.wiktionary.org/wiki/Vorlage:Deutsch_Substantiv_Übersicht
      https://de.wiktionary.org/wiki/Vorlage:Deutsch_Vorname_Übersicht_m
    """
    kept = []
    for entry in forms:
        # * May contain parens around the article
        article_match = re.match(
            r"\(?(der|die|das|den|dem|des)\)?\s+", entry.form
        )
        if article_match is not None:
            remainder = entry.form[article_match.end() :]
            entry.form = remainder
            entry.article = article_match.group(1)
        if entry.form in ["", "—"]:
            continue
        kept.append(entry)
    return kept

359 return new_forms