Coverage for src / wiktextract / extractor / de / inflection.py: 86%

211 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import NodeKind, TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .flexion import parse_flexion_page 

9from .models import Form, WordEntry 

10from .tags import translate_raw_tags 

11 

12# Kategorie:Wiktionary:Flexionstabelle (Deutsch) 

13 

14 

def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection table template to the parser for its table kind.

    The parser is chosen from ``t_node.template_name`` alone. A template
    whose name matches none of the patterns below is ignored.
    """
    name = t_node.template_name
    # Name endings of the templates that are handled by process_noun_table.
    noun_table_endings = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )

    if (
        "Substantiv Übersicht" in name
        or name.endswith(noun_table_endings)
        or re.search(r" Personalpronomen \d$", name)
    ):
        process_noun_table(wxr, word_entry, t_node)
        return

    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return

    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return

    if name == "Deutsch Possessivpronomen":
        extract_pronoun_table(wxr, word_entry, t_node)

47 

48 

@dataclass
class RowspanHeader:
    """A table header cell that covers one or more rows or columns.

    The table parsers in this module use it both for row headers, where
    ``span`` is a remaining row count, and for column headers, where
    ``index`` and ``span`` describe the covered column range.
    """

    # Cleaned text of the header cell; it ends up in ``Form.raw_tags``.
    text: str
    # Index of the first column covered (row headers are created with 0).
    index: int
    # Number of rows or columns the header covers.
    span: int

54 

55 

def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract verb forms from the first table of an expanded template.

    Each non-empty value found in a data cell (values are separated by
    newlines and commas) is appended to ``word_entry.forms`` as a ``Form``.
    The form is tagged with the column header at the same data-cell index,
    if there is one, and with every row header that is still active.
    Links and values starting with "Flexion:" are handed to
    ``parse_flexion_page`` instead of being stored as forms.

    A ``rowspan`` attribute that is not a valid integer is treated as 1
    instead of raising ``ValueError``.
    """
    # de.wiktionary template page: Vorlage:Deutsch Verb Übersicht

    def cell_rowspan(cell) -> int:
        # A malformed attribute must not abort extraction of the whole table.
        try:
            return int(cell.attrs.get("rowspan", "1"))
        except ValueError:
            return 1

    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    col_headers = []
    has_person = False
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0  # counts data cells only
        header_col_index = 0  # counts non-empty header cells only
        pronouns = []
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                # Cell linking to the full inflection page(s).
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif header_col_index == 0:
                    # First header of the row labels this and following rows.
                    row_headers.append(
                        RowspanHeader(cell_text, 0, cell_rowspan(table_cell))
                    )
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    pronouns.clear()
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if (
                    "background-color: #f4f4f4"
                    in table_cell.attrs.get("style", "").lower()
                ):
                    # Template:Englisch Verb Übersicht
                    # A data cell with this background acts as a row header
                    # and does not advance the data column index.
                    row_headers.append(
                        RowspanHeader(cell_text, 0, cell_rowspan(table_cell))
                    )
                    continue
                elif has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        # First data cell of a person row lists the pronouns.
                        pronouns = [
                            pronoun
                            for pronoun in map(str.strip, cell_text.split(","))
                            if pronoun
                        ]
                else:
                    for cell_line in cell_text.splitlines():
                        for form_str in map(str.strip, cell_line.split(",")):
                            if form_str in ["", "—", wxr.wtp.title]:
                                continue
                            if form_str.startswith("Flexion:"):
                                parse_flexion_page(wxr, word_entry, form_str)
                                continue
                            form = Form(form=form_str, pronouns=pronouns)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # End of row: keep only the row headers that still span further rows.
        remaining_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                remaining_row_headers.append(row_header)
        row_headers = remaining_row_headers

137 

138 

def process_noun_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract noun-style forms from the first table of an expanded template.

    Each line of a data cell becomes a ``Form`` appended to
    ``word_entry.forms``. It is tagged with the current table header, the
    header cell of its row, and every column header whose column range
    contains the cell. Placeholder values and the page title are skipped.
    After the table is read, the expanded template is passed through
    ``clean_node`` with ``word_entry`` and any level-4 section whose title
    starts with "Anmerkung" is passed to ``extract_note_section``.

    A ``colspan`` attribute that is not a valid integer is treated as 1
    instead of raising ``ValueError``.
    """
    # de.wiktionary template page: Vorlage:Deutsch Substantiv Übersicht
    from .page import extract_note_section

    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    column_headers = []
    table_header = ""
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ["", "Kasus", "Utrum", "m", "f", "m, f"]
                    and col_index == 0
                ):
                    # Leading corner/label header: not a column header.
                    continue
                elif is_header_row:
                    # A malformed attribute must not abort the extraction.
                    try:
                        colspan = int(table_cell.attrs.get("colspan", "1"))
                    except ValueError:
                        colspan = 1
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                # Drop a trailing number from the header text.
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                continue
            elif not row_has_header:
                # Vorlage:Deutsch adjektivisch Übersicht
                # A data cell in a row without header cells starts a new
                # sub-table: it becomes the table header and resets columns.
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ["—", "–", "-", "", "?", wxr.wtp.title]:
                        continue
                    form = Form(form=form_text)
                    if table_header:
                        form.raw_tags.append(table_header)
                    if row_header:
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if col_header.text not in ("", "—") and (
                            col_header.index
                            <= col_index
                            < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                col_index += 1

    clean_node(wxr, word_entry, expanded_template)  # category links
    # de.wiktionary template page: Vorlage:Deutsch Nachname Übersicht
    for level_node in expanded_template.find_child(NodeKind.LEVEL4):
        section_text = clean_node(wxr, None, level_node.largs)
        if section_text.startswith("Anmerkung"):
            extract_note_section(wxr, word_entry, level_node)

221 

222 

def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract adjective forms from the first table of an expanded template.

    Header cells are collected as column headers. Every line of a data cell
    that is not a placeholder is appended to ``word_entry.forms`` as a
    ``Form``, tagged with the column header stored at the cell's position
    in its row when one exists.
    """
    # de.wiktionary template page: Vorlage:Deutsch Adjektiv Übersicht
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if len(tables) == 0:
        return

    headers = []
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        cells = row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        )
        for position, cell in enumerate(cells):
            text = clean_node(wxr, None, cell)

            # The "{{int:}}" magic word is not implemented, so the template
            # "Textbaustein-Intl" expands to English words.
            if text.startswith("All other forms:"):
                for link in cell.find_child(NodeKind.LINK):
                    parse_flexion_page(
                        wxr, word_entry, clean_node(wxr, None, link)
                    )
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                headers.append(text)
                continue

            for line in text.splitlines():
                if line in ("—", "", "?"):
                    continue
                form = Form(form=line)
                if position < len(headers):
                    form.raw_tags.append(headers[position])
                translate_raw_tags(form)
                word_entry.forms.append(form)

260 

261 

def extract_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the first table of an expanded pronoun template.

    Data cells are read in pairs: the cell at an even index is kept as an
    article and the following cell is the word form, stored with the
    article prefixed unless the article is empty or "—". Each form is
    tagged with the current table header, its row header, and the column
    headers covering its column (except "Wortform"). Forms equal to the
    page title are not stored.
    """
    # de.wiktionary template page: Vorlage:Deutsch Possessivpronomen
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if len(tables) == 0:
        return

    spanning_headers = []
    section_title = ""
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        row_label = ""
        is_data_row = row.contain_node(NodeKind.TABLE_CELL)
        column = 0
        pending_article = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            text = clean_node(wxr, None, cell)

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if text == "":
                    continue
                if is_data_row:
                    row_label = text
                    continue
                header_cells = list(row.find_child(NodeKind.TABLE_HEADER_CELL))
                if len(header_cells) == 1:
                    # A lone header cell in a header-only row: new table.
                    section_title = text
                    spanning_headers.clear()
                    pending_article = ""
                    continue
                # Column header; fall back to 1 for a non-numeric colspan.
                width = 1
                raw_width = cell.attrs.get("colspan", "1")
                if re.fullmatch(r"\d+", raw_width):
                    width = int(raw_width)
                if text != "—":
                    spanning_headers.append(
                        RowspanHeader(text, column, width)
                    )
                column += width
                continue

            if cell.kind != NodeKind.TABLE_CELL:
                continue

            if column % 2 == 0:
                pending_article = text
            else:
                if pending_article in ["", "—"]:
                    form_text = text
                else:
                    form_text = pending_article + " " + text
                form = Form(form=form_text)
                if section_title != "":
                    form.raw_tags.append(section_title)
                if row_label != "":
                    form.raw_tags.append(row_label)
                for header in spanning_headers:
                    covers_column = (
                        header.index <= column < header.index + header.span
                    )
                    if covers_column and header.text != "Wortform":
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                if form.form != wxr.wtp.title:
                    word_entry.forms.append(form)
                pending_article = ""
            column += 1