Coverage for src/wiktextract/extractor/de/inflection.py: 86%

207 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import NodeKind, TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .flexion import parse_flexion_page 

9from .models import Form, WordEntry 

10from .tags import translate_raw_tags 

11 

12# Kategorie:Wiktionary:Flexionstabelle (Deutsch) 

13 

14 

def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Dispatch an inflection-table template to the parser for its layout.

    The decision is made purely on ``t_node.template_name``.  Templates
    whose name matches none of the known patterns are ignored.
    """
    name = t_node.template_name

    # Templates that share the noun-style table layout.
    noun_style_suffixes = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )
    uses_noun_layout = (
        "Substantiv Übersicht" in name
        or name.endswith(noun_style_suffixes)
        or re.search(r" Personalpronomen \d$", name) is not None
    )
    if uses_noun_layout:
        process_noun_table(wxr, word_entry, t_node)
        return

    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return

    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return

    if name == "Deutsch Possessivpronomen":
        extract_pronoun_table(wxr, word_entry, t_node)

47 

48 

@dataclass
class RowspanHeader:
    """A table header cell together with its position and extent.

    Despite the name it is used for row headers (``span`` holds the
    rowspan and is counted down row by row) as well as for column headers
    (``span`` holds the colspan, ``index`` the first covered column).
    """

    text: str  # cleaned header text, later added to a form as a raw tag
    index: int  # column index at which the header starts
    span: int  # number of rows or columns the header covers

54 

55 

def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect verb forms from a conjugation overview template.

    The template is expanded and only the first table of the expansion is
    read.  Each extracted form is appended to ``word_entry.forms`` with raw
    tags taken from the matching column header and all currently active
    row headers.  Link or cell texts that start with "Flexion:" are handed
    to ``parse_flexion_page`` instead.  Nothing happens when the expansion
    contains no table.
    """
    # Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    col_headers = []
    has_person = False
    # Row headers that still apply; rowspan headers survive several rows.
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0  # counts data cells only
        header_col_index = 0  # counts non-empty header cells only
        pronouns = []
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif header_col_index == 0:
                    # A malformed rowspan attribute must not abort the
                    # whole page; treat it as a header for this row only.
                    try:
                        rowspan = int(table_cell.attrs.get("rowspan", "1"))
                    except ValueError:
                        rowspan = 1
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    pronouns.clear()
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if has_person and col_index == 0:
                    # First data cell of a "Person" table: either a number
                    # label or the pronoun(s) for the forms that follow.
                    if cell_text in ("Singular", "Plural"):
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        pronouns = list(
                            filter(None, map(str.strip, cell_text.split(",")))
                        )
                else:
                    for cell_line in cell_text.splitlines():
                        for form_str in map(str.strip, cell_line.split(",")):
                            if form_str in ["", "—", wxr.wtp.title]:
                                continue
                            elif form_str.startswith("Flexion:"):
                                parse_flexion_page(wxr, word_entry, form_str)
                                continue
                            form = Form(form=form_str, pronouns=pronouns)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # Keep only the row headers whose rowspan reaches the next row.
        remaining_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                remaining_row_headers.append(row_header)
        row_headers = remaining_row_headers

129 

130 

def process_noun_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect forms from a noun-style declension overview template.

    The template is expanded and only the first table of the expansion is
    read.  Each form is appended to ``word_entry.forms`` with raw tags from
    the current table header, the row header and every column header whose
    span covers the cell's column.  Afterwards the whole expansion is
    cleaned against ``word_entry`` (category links) and level-4 sections
    whose title starts with "Anmerkung" are passed to
    ``extract_note_section``.  Returns early when there is no table.
    """
    # Vorlage:Deutsch Substantiv Übersicht
    from .page import extract_note_section

    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    column_headers = []
    table_header = ""
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ["", "Kasus", "Utrum", "m", "f", "m, f"]
                    and col_index == 0
                ):
                    # Corner cell above the row headers; not a column.
                    continue
                elif is_header_row:
                    # A malformed colspan attribute must not abort the
                    # whole page; fall back to a single column.
                    try:
                        colspan = int(table_cell.attrs.get("colspan", "1"))
                    except ValueError:
                        colspan = 1
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                # drop a trailing number, e.g. "Plural 1"
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                # NOTE(review): the empty cell is skipped without advancing
                # col_index, so later cells in this row shift one column to
                # the left - confirm this is intended.
                continue
            elif not row_has_header:
                # Vorlage:Deutsch adjektivisch Übersicht
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ["—", "–", "-", "", "?", wxr.wtp.title]:
                        continue
                    form = Form(form=form_text)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if len(row_header) > 0:
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if (
                            col_header.text not in ("", "—")
                            and col_index >= col_header.index
                            and col_index < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                col_index += 1

    clean_node(wxr, word_entry, expanded_template)  # category links
    # Vorlage:Deutsch Nachname Übersicht
    for level_node in expanded_template.find_child(NodeKind.LEVEL4):
        section_text = clean_node(wxr, None, level_node.largs)
        if section_text.startswith("Anmerkung"):
            extract_note_section(wxr, word_entry, level_node)

213 

214 

def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Collect forms from an adjective/adverb overview template.

    Only the first table of the expanded template is read.  Header cell
    texts are accumulated in order, and every line of a data cell becomes
    a form tagged with the header stored at the cell's position in its
    row.  Links in the "All other forms:" cell are forwarded to
    ``parse_flexion_page``.
    """
    # Vorlage:Deutsch Adjektiv Übersicht
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    first_table = next(iter(root.find_child(NodeKind.TABLE)), None)
    if first_table is None:
        return

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    headers = []
    for row in first_table.find_child(NodeKind.TABLE_ROW):
        for position, cell in enumerate(row.find_child(cell_kinds)):
            text = clean_node(wxr, None, cell)

            # because {{int:}} magic word is not implemented
            # template "Textbaustein-Intl" expands to English words
            if text.startswith("All other forms:"):
                for link in cell.find_child(NodeKind.LINK):
                    link_title = clean_node(wxr, None, link)
                    parse_flexion_page(wxr, word_entry, link_title)
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                headers.append(text)
                continue

            for line in text.splitlines():
                if line in ("—", "", "?"):
                    continue
                form = Form(form=line)
                if position < len(headers):
                    form.raw_tags.append(headers[position])
                translate_raw_tags(form)
                word_entry.forms.append(form)

252 

253 

def extract_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from the possessive pronoun overview template.

    Only the first table of the expanded template is read.  Data cells are
    consumed in pairs: the cell at an even column index holds an article
    and the following cell the word form; the two are joined with a space
    unless the article is empty or "—".  Forms equal to the page title are
    not added.  Raw tags come from the current table header, the row
    header and every column header (except "Wortform") whose span covers
    the form's column.
    """
    # Vorlage:Deutsch Possessivpronomen
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    col_headers = []
    table_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        # Counted once per row rather than once per header cell.
        header_cell_count = sum(
            1 for _ in row.find_child(NodeKind.TABLE_HEADER_CELL)
        )
        col_index = 0
        article = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                elif row_has_data:
                    row_header = cell_text
                elif header_cell_count == 1:
                    # A header-only row with a single header cell titles
                    # a new sub-table.
                    table_header = cell_text
                    col_headers.clear()  # new table
                    article = ""
                else:
                    colspan = 1
                    colspan_str = cell.attrs.get("colspan", "1")
                    # Ignore non-numeric colspan values.
                    if re.fullmatch(r"\d+", colspan_str):
                        colspan = int(colspan_str)
                    if cell_text != "—":
                        col_headers.append(
                            RowspanHeader(cell_text, col_index, colspan)
                        )
                    col_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if col_index % 2 == 0:
                    article = cell_text
                else:
                    form_str = (
                        article + " " + cell_text
                        if article not in ["", "—"]
                        else cell_text
                    )
                    form = Form(form=form_str)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for header in col_headers:
                        if (
                            col_index >= header.index
                            and col_index < header.index + header.span
                            and header.text != "Wortform"
                        ):
                            form.raw_tags.append(header.text)
                    translate_raw_tags(form)
                    if form.form != wxr.wtp.title:
                        word_entry.forms.append(form)
                    article = ""
                col_index += 1