Coverage for src/wiktextract/extractor/de/flexion.py: 70%

190 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .models import Form, WordEntry 

15from .tags import GRAMMATICAL_TAGS, translate_raw_tags 

16 

17 

18def parse_flexion_page( 

19 wxr: WiktextractContext, word_entry: WordEntry, page_title: str 

20) -> None: 

21 # https://de.wiktionary.org/wiki/Hilfe:Flexionsseiten 

22 LEVEL2_TAGS = ["Hilfsverb haben", "Hilfsverb sein"] 

23 

24 flexion_page = wxr.wtp.get_page_body( 

25 page_title, wxr.wtp.NAMESPACE_DATA["Flexion"]["id"] 

26 ) 

27 if flexion_page is None: 27 ↛ 28line 27 didn't jump to line 28 because the condition on line 27 was never true

28 return 

29 flexion_root = wxr.wtp.parse(flexion_page) 

30 shared_raw_tags = [] 

31 for node in flexion_root.find_child_recursively( 

32 NodeKind.TEMPLATE | NodeKind.LEVEL2 

33 ): 

34 match node.kind: 

35 case NodeKind.LEVEL2: 

36 shared_raw_tags.clear() 

37 section_str = clean_node(wxr, None, node.largs) 

38 for word in section_str.split(" "): 

39 word = word.strip(", ") 

40 if word in GRAMMATICAL_TAGS and not page_title.endswith( 40 ↛ 43line 40 didn't jump to line 43 because the condition on line 40 was never true

41 f":{word}" 

42 ): 

43 shared_raw_tags.append(word) 

44 for raw_tag in LEVEL2_TAGS: 

45 if raw_tag in section_str: 45 ↛ 46line 45 didn't jump to line 46 because the condition on line 45 was never true

46 shared_raw_tags.append(raw_tag) 

47 case NodeKind.TEMPLATE: 47 ↛ 31line 47 didn't jump to line 31 because the pattern on line 47 always matched

48 if node.template_name == "Deklinationsseite Numerale": 48 ↛ 49line 48 didn't jump to line 49 because the condition on line 48 was never true

49 extract_deklinationsseite_numerale_template( 

50 wxr, word_entry, node, page_title 

51 ) 

52 elif node.template_name.startswith("Deklinationsseite"): 

53 process_deklinationsseite_template( 

54 wxr, word_entry, node, page_title 

55 ) 

56 elif node.template_name.startswith("Deutsch Verb"): 

57 process_deutsch_verb_template( 

58 wxr, word_entry, node, page_title, shared_raw_tags 

59 ) 

60 

61 

@dataclass
class SpanHeader:
    """A table header cell together with the column range it covers."""

    # Cleaned text of the header cell.
    text: str
    # Column index at which the header starts.
    index: int
    # Number of columns covered; the header applies to column ``c`` when
    # ``index <= c < index + span``.
    span: int

67 

68 

def process_deklinationsseite_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite ..." template.

    The template is expanded and its direct HTML and table children are
    visited in order.  The text of the latest ``<h4>`` element is attached as
    a raw tag to every form found in the tables that follow it.  Inside a
    table, header cells with a ``colspan`` attribute become column headers,
    other non-empty header cells become the row header, and data cells
    yield one ``Form`` per text line, appended to ``word_entry.forms`` with
    ``page_tite`` as source.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Adjektiv
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    h4_text = ""
    for node in expanded_template.find_child(NodeKind.HTML | NodeKind.TABLE):
        if isinstance(node, HTMLNode) and node.tag == "h4":
            h4_text = clean_node(wxr, None, node)
        elif node.kind == NodeKind.TABLE:
            col_headers = []
            has_article = False
            for row_node in node.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                article = ""
                for cell_node in row_node.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    cell_text = clean_node(wxr, None, cell_node)
                    if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                        if cell_text == "":
                            continue
                        elif cell_text in ("Artikel", "Wortform"):
                            # Data cells now alternate article / word form.
                            has_article = True
                            continue
                        elif "colspan" in cell_node.attrs:
                            col_span = int(cell_node.attrs.get("colspan"))
                            if col_span == 9:  # new table
                                has_article = False
                                col_headers.clear()
                            col_headers.append(
                                SpanHeader(cell_text, col_index, col_span)
                            )
                            col_index += col_span
                        else:
                            row_header = cell_text
                    elif cell_node.kind == NodeKind.TABLE_CELL:
                        if has_article and col_index % 2 == 0:
                            # Even columns hold the article that prefixes
                            # the word form in the next column.
                            article = cell_text
                        else:
                            form_text = ""
                            if article not in ("", "—"):
                                form_text = article + " "
                            raw_tags = []
                            if h4_text != "":
                                raw_tags.append(h4_text)
                            if row_header != "":
                                raw_tags.append(row_header)
                            for col_header in col_headers:
                                if (
                                    col_header.text not in ("", "—")
                                    and col_index >= col_header.index
                                    and col_index
                                    < col_header.index + col_header.span
                                ):
                                    raw_tags.append(col_header.text)
                            for line in cell_text.splitlines():
                                form = Form(
                                    form=form_text + line,
                                    source=page_tite,
                                    # Each form gets its own list so tag
                                    # translation of one form cannot touch
                                    # another form from the same cell.
                                    raw_tags=list(raw_tags),
                                )
                                if form.form not in ("", "—"):
                                    translate_raw_tags(form)
                                    word_entry.forms.append(form)
                        # Article cells advance the column as well, which
                        # keeps the even/odd alternation in step.
                        col_index += int(cell_node.attrs.get("colspan", "1"))

141 

142 

def process_deutsch_verb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Expand a "Deutsch Verb ..." template and process its sections.

    The template is converted back to wikitext, parsed with full expansion,
    and every direct level (heading) child is passed to
    ``process_deutsch_verb_section`` together with ``shared_raw_tags``.
    """
    # Vorlage:Deutsch Verb regelmäßig
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for level_node in expanded_template.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, level_node, page_tite, shared_raw_tags
        )

158 

159 

def process_deutsch_verb_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Process the tables of one section, then recurse into its subsections.

    The cleaned section title is added to a copy of ``shared_raw_tags``; the
    caller's list is left untouched.  The extended list is passed both to
    the section's own tables and to every nested section, so tags accumulate
    along the heading hierarchy.
    """
    section_title = clean_node(wxr, None, level_node.largs)
    # Build a new list rather than appending in place: sibling sections must
    # not see each other's titles.
    new_raw_tags = [*shared_raw_tags, section_title]
    for table_node in level_node.find_child(NodeKind.TABLE):
        process_deutsch_verb_table(
            wxr, word_entry, table_node, page_tite, new_raw_tags
        )
    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, next_level, page_tite, new_raw_tags
        )

178 

179 

def process_deutsch_verb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Extract verb forms from one table of a "Deutsch Verb ..." template.

    Rows are scanned in order.  Header cells, and data cells that look like
    headers (bold text, a lone ``<small>`` element, or the ``#f4f4f4``
    background colour), are recorded as column headers or as the row header.
    Every other non-placeholder data cell yields one ``Form`` per text line,
    tagged with ``shared_raw_tags``, an optional "label:" prefix, the row
    header, and the column headers covering the cell's column.  Forms are
    appended to ``word_entry.forms`` with ``page_tite`` as source.
    """
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    col_headers = []
    for row in table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        col_index = 0
        col_header_index = 0
        # True when every non-empty data cell of the row contains bold text;
        # such a row is treated as a row of column headers.
        is_bold_col_header = all(
            c.contain_node(NodeKind.BOLD)
            for c in row.find_child(NodeKind.TABLE_CELL)
            if clean_node(wxr, None, c) != ""
        )
        # List the cells once; the list is needed for both the single-cell
        # test and the main loop.
        cells = list(row.find_child(cell_kinds))
        if len(cells) == 1:
            col_headers.clear()  # new table
        for cell in cells:
            cell_text = clean_node(wxr, None, cell)
            if cell_text in (
                "Flexion der Verbaladjektive",
                "(nichterweiterte) Infinitive",
            ):
                # The rest of this row is skipped.
                break
            elif cell.kind == NodeKind.TABLE_HEADER_CELL and cell_text not in (
                "",
                "Person",
            ):
                colspan = int(cell.attrs.get("colspan", "1"))
                col_headers.append(
                    SpanHeader(cell_text, col_header_index, colspan)
                )
                col_header_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if cell_text in (
                    "",
                    "—",
                    "Text",
                    "Person",
                ) or cell_text.startswith("Flexion:"):
                    # Placeholder cell: occupies a column but has no form.
                    col_index += 1
                elif (
                    cell.contain_node(NodeKind.BOLD)
                    or (
                        len(list(cell.find_html("small"))) > 0
                        and len(list(cell.filter_empty_str_child())) == 1
                    )
                    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
                    or cell.attrs.get("bgcolor", "").lower() == "#f4f4f4"
                ):  # header in cell
                    colspan = int(cell.attrs.get("colspan", "1"))
                    if is_bold_col_header:
                        for bold_node in cell.find_child(NodeKind.BOLD):
                            col_headers.append(
                                SpanHeader(
                                    clean_node(wxr, None, bold_node),
                                    col_header_index,
                                    colspan,
                                )
                            )
                    else:
                        row_header = cell_text
                    col_header_index += colspan
                else:
                    for form_text in cell_text.splitlines():
                        form_text = form_text.strip(", ")
                        form_raw_tag = ""
                        if ":" in form_text:
                            # "label: form" - the label becomes a raw tag.
                            form_raw_tag, form_text = form_text.split(":", 1)
                        form = Form(
                            form=form_text.strip(),
                            source=page_tite,
                            # Copy: tags are appended to form.raw_tags below
                            # and must not end up in the caller's list.
                            raw_tags=list(shared_raw_tags),
                        )
                        if form_raw_tag != "":
                            form.raw_tags.append(form_raw_tag)
                        if row_header != "":
                            form.raw_tags.append(row_header)
                        for col_header in col_headers:
                            if (
                                col_index >= col_header.index
                                and col_index
                                < col_header.index + col_header.span
                            ):
                                if col_header.text.endswith("I"):
                                    # Headers ending in "I" are kept whole
                                    # instead of being split into words.
                                    form.raw_tags.append(col_header.text)
                                else:
                                    form.raw_tags.extend(
                                        col_header.text.split()
                                    )
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    col_index += 1

289 

290 

def extract_deklinationsseite_numerale_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite Numerale" template.

    Every direct table child of the expanded template is scanned row by row.
    In a row without data cells the header cells become column headers; in
    a row with data cells a header cell becomes the row header.  The
    children of each data cell are split at ``<br>`` elements into separate
    words, italic children are collected as raw tags, and each non-empty
    word is handed to ``deklinationsseite_numerale_add_form``.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Numerale
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_template.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    continue
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if row_has_data:
                        row_header = cell_text
                    else:
                        col_span = int(cell.attrs.get("colspan", "1"))
                        # First header of a header-only row: the headers of
                        # the previous sub-table no longer apply.
                        if col_index == 0:
                            col_headers.clear()  # new table
                        col_headers.append(
                            SpanHeader(cell_text, col_index, col_span)
                        )
                        col_index += col_span
                else:
                    word_nodes = []
                    raw_tags = []

                    def add_pending_word() -> None:
                        # Emit the nodes gathered so far as one form,
                        # unless they clean to an empty string.
                        word = clean_node(wxr, None, word_nodes)
                        if word != "":
                            deklinationsseite_numerale_add_form(
                                word_entry,
                                word,
                                page_tite,
                                raw_tags,
                                col_index,
                                row_header,
                                col_headers,
                            )

                    for cell_child in cell.children:
                        if (
                            isinstance(cell_child, HTMLNode)
                            and cell_child.tag == "br"
                        ):
                            # A line break ends the current word.
                            add_pending_word()
                            word_nodes.clear()
                        elif (
                            isinstance(cell_child, WikiNode)
                            and cell_child.kind == NodeKind.ITALIC
                        ):
                            raw_tag = clean_node(wxr, None, cell_child).strip(
                                ": "
                            )
                            if raw_tag != "":
                                raw_tags.append(raw_tag)
                        else:
                            word_nodes.append(cell_child)
                    # Whatever follows the last <br> (or the whole cell
                    # when there is none).
                    add_pending_word()
                    col_index += 1

367 

368 

def deklinationsseite_numerale_add_form(
    word_entry: WordEntry,
    word: str,
    source: str,
    raw_tags: list[str],
    index: int,
    row_header: str,
    col_headers: list[SpanHeader],
) -> None:
    """Create one ``Form`` for ``word`` and append it to ``word_entry.forms``.

    The form's raw tags are ``raw_tags``, followed by ``row_header`` when it
    is not empty, followed by the text of every column header whose range
    ``[index, index + span)`` contains the column ``index`` passed in.  The
    tags are then passed through ``translate_raw_tags``.
    """
    form = Form(
        form=word,
        source=source,
        # Copy: the caller reuses its list for later words of the same cell,
        # and the appends below must not accumulate in it.
        raw_tags=list(raw_tags),
    )
    if row_header != "":
        form.raw_tags.append(row_header)
    form.raw_tags.extend(
        col_header.text
        for col_header in col_headers
        if col_header.index <= index < col_header.index + col_header.span
    )
    translate_raw_tags(form)
    word_entry.forms.append(form)