Coverage for src/wiktextract/extractor/de/inflection.py: 86%

214 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import NodeKind, TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .flexion import parse_flexion_page 

9from .models import Form, WordEntry 

10from .tags import translate_raw_tags 

11 

12# Kategorie:Wiktionary:Flexionstabelle (Deutsch) 

13 

14 

def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection-table template to the parser for its table type.

    The parser is selected purely from ``t_node.template_name``. A template
    whose name matches none of the patterns below is silently ignored.
    """
    name = t_node.template_name

    # Template name endings that are parsed with the noun table parser.
    noun_table_suffixes = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )
    uses_noun_parser = (
        "Substantiv Übersicht" in name
        or name.endswith(noun_table_suffixes)
        or re.search(r" Personalpronomen \d$", name) is not None
    )
    if uses_noun_parser:
        process_noun_table(wxr, word_entry, t_node)
        return

    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return

    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return

    if name == "Deutsch Possessivpronomen":
        extract_pronoun_table(wxr, word_entry, t_node)

47 

48 

@dataclass
class RowspanHeader:
    """A table header together with the position and extent it applies to.

    The parsers in this module use it both for row headers (``span`` counts
    rows) and for column headers (``span`` counts columns).
    """

    # Cleaned header text, later added to a form's raw tags.
    text: str
    # Index of the first row/column the header covers.
    index: int
    # Number of rows/columns the header covers.
    span: int

54 

55 

def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect forms from the first table of an expanded verb overview template.

    The template is expanded and only its first table is read. For every
    row:

    * a cell whose text starts with "All other forms:" is scanned for links
      starting with "Flexion:", which are handed to ``parse_flexion_page``;
    * the first non-empty header cell is a row header that stays active for
      as many rows as its ``rowspan`` attribute says;
    * a later header cell reading "Person" or "Wortform" switches on the
      person column; any other later header cell is a column header;
    * when the person column is on, the first data cell is either a
      one-row "Singular"/"Plural" row header or the person text that gets
      prefixed to the forms of that row;
    * every non-empty line of the remaining data cells becomes a ``Form``
      appended to ``word_entry.forms``, tagged with the column header at
      the same index (if any) and all active row headers.

    Forms equal to the page title are skipped. Returns ``None``; the result
    is the mutation of ``word_entry``.
    """
    # Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    col_headers = []
    has_person = False
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        header_col_index = 0
        person = ""
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif header_col_index == 0:
                    # A malformed rowspan attribute must not abort the whole
                    # table; fall back to a single row, as the pronoun table
                    # parser in this module does for colspan.
                    try:
                        rowspan = int(table_cell.attrs.get("rowspan", "1"))
                    except ValueError:
                        rowspan = 1
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    person = ""
                # only non-empty header cells are counted
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        person = cell_text
                else:
                    for cell_line in cell_text.splitlines():
                        cell_line = cell_line.strip()
                        if cell_line in ["", "—"]:
                            continue
                        elif cell_line.startswith("Flexion:"):
                            parse_flexion_page(wxr, word_entry, cell_line)
                            continue
                        # "".split(",") yields [""], so a row without a
                        # person still produces one unprefixed form.
                        for p in person.split(","):
                            p = p.strip()
                            form_text = cell_line
                            if p != "":
                                form_text = p + " " + cell_line
                            if form_text == wxr.wtp.title:
                                continue
                            form = Form(form=form_text)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # Age the row headers: keep only those that still span the next row.
        new_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                new_row_headers.append(row_header)
        row_headers = new_row_headers

134 

135 

def process_noun_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect forms from the first table of an expanded noun-style template.

    The template is expanded and only its first table is read. For every
    row:

    * in a row without data cells, header cells become column headers that
      cover ``colspan`` columns (trailing digits are stripped from the
      header text);
    * in a row with data cells, a header cell is the row header;
    * a leading header cell that is empty or one of "Kasus", "Utrum", "m",
      "f", "m, f" is skipped;
    * a non-empty data cell in a row without any header cell is taken as
      the heading of a new sub-table: it resets the column headers and its
      "Flexion:" links are handed to ``parse_flexion_page``;
    * every line of the other data cells becomes a ``Form`` appended to
      ``word_entry.forms``, tagged with the sub-table heading, the row
      header and every column header covering the cell's column.

    Afterwards the whole expansion is cleaned once more with ``word_entry``
    as target (category links), and level-4 sections whose title starts
    with "Anmerkung" are passed to ``extract_note_section``.

    Returns ``None``; the result is the mutation of ``word_entry``.
    """
    # Vorlage:Deutsch Substantiv Übersicht
    # NOTE(review): function-level import, presumably to avoid a circular
    # import with the page module - confirm before moving it.
    from .page import extract_note_section

    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    column_headers = []
    table_header = ""
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ["", "Kasus", "Utrum", "m", "f", "m, f"]
                    and col_index == 0
                ):
                    continue
                elif is_header_row:
                    # A malformed colspan attribute must not abort the whole
                    # table; fall back to a single column, as the pronoun
                    # table parser in this module does.
                    try:
                        colspan = int(table_cell.attrs.get("colspan", "1"))
                    except ValueError:
                        colspan = 1
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    # advance even when no header was stored
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                continue
            elif not row_has_header:
                # Vorlage:Deutsch adjektivisch Übersicht
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ["—", "–", "-", "", "?", wxr.wtp.title]:
                        continue
                    form = Form(form=form_text)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if (
                            col_header.text not in ("", "—")
                            and col_index >= col_header.index
                            and col_index < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                col_index += 1

    clean_node(wxr, word_entry, expanded_template)  # category links
    # Vorlage:Deutsch Nachname Übersicht
    for level_node in expanded_template.find_child(NodeKind.LEVEL4):
        section_text = clean_node(wxr, None, level_node.largs)
        if section_text.startswith("Anmerkung"):
            extract_note_section(wxr, word_entry, level_node)

218 

219 

def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect forms from the first table of an expanded adjective template.

    Header cells are accumulated, in encounter order, into one list of
    column headers. Each line of a data cell becomes a ``Form`` appended to
    ``word_entry.forms``, tagged with the header at the cell's position in
    its row when such a header exists. Every link inside an
    "All other forms:" cell is handed to ``parse_flexion_page``.
    """
    # Vorlage:Deutsch Adjektiv Übersicht
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    tables = list(root.find_child(NodeKind.TABLE))
    if not tables:
        return

    headers = []
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        cells = row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        )
        for position, cell in enumerate(cells):
            text = clean_node(wxr, None, cell)

            # because {{int:}} magic word is not implemented
            # template "Textbaustein-Intl" expands to English words
            if text.startswith("All other forms:"):
                for link in cell.find_child(NodeKind.LINK):
                    parse_flexion_page(
                        wxr, word_entry, clean_node(wxr, None, link)
                    )
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                headers.append(text)
                continue

            for line in text.splitlines():
                if line in ("—", "", "?"):
                    continue
                form = Form(form=line)
                if position < len(headers):
                    form.raw_tags.append(headers[position])
                translate_raw_tags(form)
                word_entry.forms.append(form)

257 

258 

def extract_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from the first table of an expanded pronoun template.

    Header cells are interpreted per row:

    * in a row that also has data cells, the header is the row header;
    * a header that is the only header cell of a data-less row starts a new
      sub-table: it becomes the table header and clears the column headers;
    * any other header is a column header covering ``colspan`` columns
      ("—" headers are not stored but still advance the column position).

    Data cells are read in pairs: the even-positioned cell is remembered as
    the article and the odd-positioned cell yields the form, prefixed with
    the article unless that is empty or "—". Forms equal to the page title
    are not added to ``word_entry.forms``.
    """
    # Vorlage:Deutsch Possessivpronomen
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if not tables:
        return

    span_headers = []
    caption = ""
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        row_label = ""
        has_data_cells = row.contain_node(NodeKind.TABLE_CELL)
        position = 0
        article = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            text = clean_node(wxr, None, cell)
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if text == "":
                    continue
                if has_data_cells:
                    row_label = text
                elif (
                    len(list(row.find_child(NodeKind.TABLE_HEADER_CELL)))
                    == 1
                ):
                    caption = text
                    span_headers.clear()  # new table
                    article = ""
                else:
                    raw_span = cell.attrs.get("colspan", "1")
                    # non-numeric colspan values fall back to one column
                    width = (
                        int(raw_span) if re.fullmatch(r"\d+", raw_span) else 1
                    )
                    if text != "—":
                        span_headers.append(
                            RowspanHeader(text, position, width)
                        )
                    position += width
            elif cell.kind == NodeKind.TABLE_CELL:
                if position % 2 == 0:
                    article = text
                else:
                    if article in ["", "—"]:
                        form_str = text
                    else:
                        form_str = article + " " + text
                    form = Form(form=form_str)
                    if caption != "":
                        form.raw_tags.append(caption)
                    if row_label != "":
                        form.raw_tags.append(row_label)
                    for header in span_headers:
                        covers_cell = (
                            header.index
                            <= position
                            < header.index + header.span
                        )
                        if covers_cell and header.text != "Wortform":
                            form.raw_tags.append(header.text)
                    translate_raw_tags(form)
                    if form.form != wxr.wtp.title:
                        word_entry.forms.append(form)
                    article = ""
                position += 1