Coverage for src/wiktextract/extractor/de/flexion.py: 69%

188 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-06-23 09:14 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .models import Form, WordEntry 

15from .tags import TAGS, translate_raw_tags 

16 

17 

def parse_flexion_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract inflected forms from the "Flexion" namespace page of a word.

    The body of ``page_title`` in the "Flexion" namespace is parsed and every
    level-2 heading and template found in it is visited in document order:

    * a level-2 heading resets the list of raw tags that is handed to later
      "Deutsch Verb" templates and refills it from the heading text;
    * a template is dispatched by name to the matching extractor, which
      appends the forms it finds to ``word_entry``.

    Nothing happens when the page body is ``None``.
    """
    # https://de.wiktionary.org/wiki/Hilfe:Flexionsseiten
    auxiliary_tags = ("Hilfsverb haben", "Hilfsverb sein")

    namespace_id = wxr.wtp.NAMESPACE_DATA["Flexion"]["id"]
    page_body = wxr.wtp.get_page_body(page_title, namespace_id)
    if page_body is None:
        return

    heading_tags = []
    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LEVEL2
    for node in wxr.wtp.parse(page_body).find_child_recursively(wanted_kinds):
        if node.kind == NodeKind.LEVEL2:
            # Tags taken from a heading only apply until the next heading.
            heading_tags.clear()
            heading = clean_node(wxr, None, node.largs)
            # Single words of the heading that are known tags, except a word
            # that is the part of the page title after the last colon.
            heading_tags.extend(
                token
                for token in (part.strip(", ") for part in heading.split(" "))
                if token in TAGS and not page_title.endswith(f":{token}")
            )
            # Multi-word tags are matched against the whole heading instead.
            heading_tags.extend(
                tag for tag in auxiliary_tags if tag in heading
            )
        elif node.kind == NodeKind.TEMPLATE:
            name = node.template_name
            # The exact name must be tested before the generic prefix.
            if name == "Deklinationsseite Numerale":
                extract_deklinationsseite_numerale_template(
                    wxr, word_entry, node, page_title
                )
            elif name.startswith("Deklinationsseite"):
                process_deklinationsseite_template(
                    wxr, word_entry, node, page_title
                )
            elif name.startswith("Deutsch Verb"):
                process_deutsch_verb_template(
                    wxr, word_entry, node, page_title, heading_tags
                )

58 

59 

@dataclass
class SpanHeader:
    """A table header together with the range of columns it covers.

    A header applies to column ``c`` when
    ``index <= c < index + span``.
    """

    # Cleaned text of the header cell.
    text: str
    # Column index at which the header starts.
    index: int
    # Number of columns the header spans.
    span: int

65 

66 

def process_deklinationsseite_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite ..." template.

    The template is expanded and its direct ``<h4>`` and table children are
    visited in order.  The text of the most recent ``<h4>`` is used as a raw
    tag for every form of the tables that follow it.  Inside a table, header
    cells provide row headers, spanning column headers and the switch for
    article columns; each line of a data cell becomes one ``Form`` appended
    to ``word_entry.forms`` with ``page_tite`` as its source.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Adjektiv
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    any_cell = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL

    heading = ""
    for child in expanded.find_child(NodeKind.HTML | NodeKind.TABLE):
        if isinstance(child, HTMLNode) and child.tag == "h4":
            heading = clean_node(wxr, None, child)
            continue
        if child.kind != NodeKind.TABLE:
            continue

        span_headers = []
        article_columns = False
        for row in child.find_child(NodeKind.TABLE_ROW):
            column = 0
            row_label = ""
            article = ""
            for cell in row.find_child(any_cell):
                text = clean_node(wxr, None, cell)

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if text == "":
                        pass
                    elif text in ("Artikel", "Wortform"):
                        # From here on data cells alternate article / form.
                        article_columns = True
                    elif "colspan" in cell.attrs:
                        width = int(cell.attrs.get("colspan"))
                        if width == 9:  # new table
                            article_columns = False
                            span_headers.clear()
                        span_headers.append(SpanHeader(text, column, width))
                        column += width
                    else:
                        row_label = text
                    continue

                if cell.kind != NodeKind.TABLE_CELL:
                    continue

                if article_columns and column % 2 == 0:
                    # Even columns hold the article for the following form.
                    if text != "—":
                        article = text
                else:
                    tags = [tag for tag in (heading, row_label) if tag != ""]
                    tags.extend(
                        header.text
                        for header in span_headers
                        if header.text not in ("", "—")
                        and header.index <= column < header.index + header.span
                    )
                    for line in text.splitlines():
                        form = Form(
                            form=line,
                            source=page_tite,
                            raw_tags=tags,
                            article=article,
                        )
                        if form.form not in ("", "—"):
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                # Only data cells advance the column index here; spanning
                # header cells already did so above.
                column += int(cell.attrs.get("colspan", "1"))

138 

139 

def process_deutsch_verb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Expand a "Deutsch Verb ..." template and process its sections.

    Every direct heading child of the expanded template is passed to
    ``process_deutsch_verb_section`` together with ``shared_raw_tags``.
    """
    # Vorlage:Deutsch Verb regelmäßig
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for section in expanded.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, section, page_tite, shared_raw_tags
        )

155 

156 

def process_deutsch_verb_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Process the tables of one heading section and of its subsections.

    The section title is added to a copy of ``shared_raw_tags`` (the caller's
    list is left untouched).  That extended list is used for the tables that
    are direct children of the section and is passed on, recursively, to the
    nested heading sections.
    """
    heading = clean_node(wxr, None, level_node.largs)
    section_tags = [*shared_raw_tags, heading]

    for table in level_node.find_child(NodeKind.TABLE):
        process_deutsch_verb_table(
            wxr, word_entry, table, page_tite, section_tags
        )
    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, subsection, page_tite, section_tags
        )

175 

176 

def process_deutsch_verb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Extract verb forms from one table of a "Deutsch Verb" template.

    Rows are scanned cell by cell.  Header cells, and data cells that look
    like headers (bold content, a lone ``<small>`` child, or a ``#f4f4f4``
    background), are recorded as column or row headers.  Every line of a
    remaining data cell becomes a ``Form`` appended to ``word_entry.forms``,
    tagged with ``shared_raw_tags``, an optional ``prefix:`` found in the
    line, the row header and the column headers covering the cell.
    """
    any_cell = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    # A cell with one of these titles ends the processing of its row.
    row_stop_titles = (
        "Flexion der Verbaladjektive",
        "(nichterweiterte) Infinitive",
    )

    span_headers = []
    for row in table.find_child(NodeKind.TABLE_ROW):
        row_label = ""
        data_column = 0
        header_column = 0
        # True when every non-empty data cell of the row contains bold text
        # (also true for a row without non-empty data cells).
        bold_header_row = all(
            data_cell.contain_node(NodeKind.BOLD)
            for data_cell in row.find_child(NodeKind.TABLE_CELL)
            if clean_node(wxr, None, data_cell) != ""
        )
        cell_count = sum(1 for _ in row.find_child(any_cell))
        if cell_count == 1:
            span_headers.clear()  # new table

        for cell in row.find_child(any_cell):
            text = clean_node(wxr, None, cell)
            if text in row_stop_titles:
                break

            if cell.kind == NodeKind.TABLE_HEADER_CELL and text not in (
                "",
                "Person",
            ):
                width = int(cell.attrs.get("colspan", "1"))
                span_headers.append(SpanHeader(text, header_column, width))
                header_column += width
                continue

            if cell.kind != NodeKind.TABLE_CELL:
                continue

            if text in ("", "—", "Text", "Person") or text.startswith(
                "Flexion:"
            ):
                # Placeholder cell: occupies a column but carries no form.
                data_column += 1
                continue

            header_in_cell = (
                cell.contain_node(NodeKind.BOLD)
                or (
                    len(list(cell.find_html("small"))) > 0
                    and len(list(cell.filter_empty_str_child())) == 1
                )
                # Vorlage:Deutsch Verb schwach untrennbar reflexiv
                or cell.attrs.get("bgcolor", "").lower() == "#f4f4f4"
            )
            if header_in_cell:
                width = int(cell.attrs.get("colspan", "1"))
                if bold_header_row:
                    for bold in cell.find_child(NodeKind.BOLD):
                        span_headers.append(
                            SpanHeader(
                                clean_node(wxr, None, bold),
                                header_column,
                                width,
                            )
                        )
                else:
                    row_label = text
                header_column += width
                continue

            for entry in text.splitlines():
                entry = entry.strip(", ")
                prefix = ""
                if ":" in entry:
                    prefix, entry = entry.split(":", 1)
                # NOTE(review): the tags are appended after construction, as
                # before; whether that touches the caller's list depends on
                # whether Form copies ``raw_tags`` - confirm in .models.
                form = Form(
                    form=entry.strip(),
                    source=page_tite,
                    raw_tags=shared_raw_tags,
                )
                if prefix != "":
                    form.raw_tags.append(prefix)
                if row_label != "":
                    form.raw_tags.append(row_label)
                for header in span_headers:
                    if not (
                        header.index
                        <= data_column
                        < header.index + header.span
                    ):
                        continue
                    if header.text.endswith("I"):
                        # Headers ending in "I" are kept as one tag.
                        form.raw_tags.append(header.text)
                    else:
                        form.raw_tags.extend(header.text.split())
                translate_raw_tags(form)
                word_entry.forms.append(form)
            data_column += 1

286 

287 

def extract_deklinationsseite_numerale_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite Numerale" template.

    Each table that is a direct child of the expanded template is read row
    by row.  In a row without data cells the header cells become spanning
    column headers; in a row with data cells a header cell is the row
    header.  The content of a data cell is split at ``<br>`` tags into
    words, italic children are collected as raw tags, and every non-empty
    word is added through ``deklinationsseite_numerale_add_form``.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Numerale
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    any_cell = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    # Marker appended after a cell's children so that the last word is
    # emitted by the same code path as the words terminated by <br>.
    end_of_cell = object()

    for table in expanded.find_child(NodeKind.TABLE):
        span_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_label = ""
            data_row = row.contain_node(NodeKind.TABLE_CELL)
            column = 0
            for cell in row.find_child(any_cell):
                text = clean_node(wxr, None, cell)
                # NOTE(review): a blank data cell is skipped without
                # advancing the column index - confirm this is intended.
                if text == "":
                    continue

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if data_row:
                        row_label = text
                        continue
                    width = int(cell.attrs.get("colspan", "1"))
                    if column == 0:
                        span_headers.clear()  # new table
                    span_headers.append(SpanHeader(text, column, width))
                    column += width
                    continue

                pending_nodes = []
                cell_tags = []
                for child in (*cell.children, end_of_cell):
                    if child is end_of_cell or (
                        isinstance(child, HTMLNode) and child.tag == "br"
                    ):
                        word = clean_node(wxr, None, pending_nodes)
                        if word != "":
                            deklinationsseite_numerale_add_form(
                                word_entry,
                                word,
                                page_tite,
                                cell_tags,
                                column,
                                row_label,
                                span_headers,
                            )
                        pending_nodes.clear()
                    elif (
                        isinstance(child, WikiNode)
                        and child.kind == NodeKind.ITALIC
                    ):
                        label = clean_node(wxr, None, child).strip(": ")
                        if label != "":
                            cell_tags.append(label)
                    else:
                        pending_nodes.append(child)
                column += 1

364 

365 

def deklinationsseite_numerale_add_form(
    word_entry: WordEntry,
    word: str,
    source: str,
    raw_tags: list[str],
    index: int,
    row_header: str,
    col_headers: list[SpanHeader],
) -> None:
    """Append one form of a numeral declension table to ``word_entry``.

    The form is created from ``word``, ``source`` and ``raw_tags``; then the
    non-empty ``row_header`` and the text of every column header whose span
    contains column ``index`` are added to its raw tags.  The raw tags are
    translated before the form is stored in ``word_entry.forms``.
    """
    # NOTE(review): tags are appended after construction, as before; whether
    # the caller's ``raw_tags`` list is affected depends on whether Form
    # copies it - confirm in .models.
    new_form = Form(form=word, source=source, raw_tags=raw_tags)
    if row_header != "":
        new_form.raw_tags.append(row_header)
    new_form.raw_tags.extend(
        header.text
        for header in col_headers
        if header.index <= index < header.index + header.span
    )
    translate_raw_tags(new_form)
    word_entry.forms.append(new_form)