Coverage for src / wiktextract / extractor / de / flexion.py: 69%

188 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .models import Form, WordEntry 

15from .tags import GRAMMATICAL_TAGS, translate_raw_tags 

16 

17 

def parse_flexion_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Add the forms found on the "Flexion" namespace page of a word.

    The page body is looked up by ``page_title`` in the "Flexion"
    namespace; nothing happens when that page does not exist.  The parsed
    page is then walked for level-2 headings and templates:

    * A level-2 heading resets the raw tags shared by the templates that
      follow it.  Every heading word found in ``GRAMMATICAL_TAGS`` is
      kept (unless ``page_title`` ends with ``:<word>``), followed by any
      auxiliary-verb label that occurs in the heading text.
    * A template is dispatched by name to the matching extractor, which
      appends to ``word_entry.forms``.
    """
    # https://de.wiktionary.org/wiki/Hilfe:Flexionsseiten
    auxiliary_verb_tags = ["Hilfsverb haben", "Hilfsverb sein"]

    namespace_id = wxr.wtp.NAMESPACE_DATA["Flexion"]["id"]
    page_body = wxr.wtp.get_page_body(page_title, namespace_id)
    if page_body is None:
        return

    root = wxr.wtp.parse(page_body)
    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LEVEL2
    # Mutated in place so the same list object is handed to every verb
    # template below the current heading.
    shared_raw_tags = []
    for node in root.find_child_recursively(wanted_kinds):
        if node.kind == NodeKind.LEVEL2:
            shared_raw_tags.clear()
            heading = clean_node(wxr, None, node.largs)
            heading_words = (
                piece.strip(", ") for piece in heading.split(" ")
            )
            shared_raw_tags.extend(
                candidate
                for candidate in heading_words
                if candidate in GRAMMATICAL_TAGS
                and not page_title.endswith(f":{candidate}")
            )
            shared_raw_tags.extend(
                label for label in auxiliary_verb_tags if label in heading
            )
        elif node.kind == NodeKind.TEMPLATE:
            # The exact "Numerale" name must be tested before the generic
            # "Deklinationsseite" prefix, which would also match it.
            if node.template_name == "Deklinationsseite Numerale":
                extract_deklinationsseite_numerale_template(
                    wxr, word_entry, node, page_title
                )
            elif node.template_name.startswith("Deklinationsseite"):
                process_deklinationsseite_template(
                    wxr, word_entry, node, page_title
                )
            elif node.template_name.startswith("Deutsch Verb"):
                process_deutsch_verb_template(
                    wxr, word_entry, node, page_title, shared_raw_tags
                )

60 

61 

@dataclass
class SpanHeader:
    """Header label together with the range of table columns it covers."""

    # Cleaned text of the header cell.
    text: str
    # Column index at which the header starts.
    index: int
    # Number of columns the header spans, starting at ``index``.
    span: int

67 

68 

def process_deklinationsseite_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite…" template.

    The template is converted back to wikitext, expanded and parsed.  The
    ``h4`` HTML nodes and tables yielded by ``find_child`` are visited in
    order: an ``h4`` sets the heading text used as a raw tag for the
    tables after it, and every table is read row by row.  Each non-empty,
    non-dash line of a data cell becomes a ``Form`` appended to
    ``word_entry.forms`` with ``page_tite`` as its source.

    Args:
        wxr: extraction context providing the wikitext parser.
        word_entry: entry whose ``forms`` list is extended.
        template_node: the declension template to expand.
        page_tite: page title stored as ``Form.source``.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Adjektiv
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    h4_text = ""
    for node in expanded_template.find_child(NodeKind.HTML | NodeKind.TABLE):
        if isinstance(node, HTMLNode) and node.tag == "h4":
            h4_text = clean_node(wxr, None, node)
        elif node.kind == NodeKind.TABLE:
            # Column headers and the article flag persist across rows of
            # the same table node until a 9-column header resets them.
            col_headers = []
            has_article = False
            for row_node in node.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                article = ""
                for cell_node in row_node.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    cell_text = clean_node(wxr, None, cell_node)
                    if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                        if cell_text == "":
                            # Unlabelled header cell: ignored, col_index
                            # is not advanced.
                            continue
                        elif cell_text in ("Artikel", "Wortform"):
                            # These labels only switch on article
                            # handling; they are not kept as headers.
                            has_article = True
                            continue
                        elif "colspan" in cell_node.attrs:
                            # A header with an explicit colspan is a
                            # column header covering that many columns.
                            col_span = int(cell_node.attrs.get("colspan"))
                            if col_span == 9:  # new table
                                has_article = False
                                col_headers.clear()
                            col_headers.append(
                                SpanHeader(cell_text, col_index, col_span)
                            )
                            col_index += col_span
                        else:
                            # Header cell without colspan labels the row.
                            row_header = cell_text
                    elif cell_node.kind == NodeKind.TABLE_CELL:
                        if has_article and col_index % 2 == 0:
                            # With article handling on, even columns are
                            # read as the article for the cells after
                            # them; a dash placeholder leaves the
                            # previous article value unchanged.
                            if cell_text != "—":
                                article = cell_text
                        else:
                            raw_tags = []
                            if h4_text != "":
                                raw_tags.append(h4_text)
                            if row_header != "":
                                raw_tags.append(row_header)
                            # Add every labelled column header whose
                            # [index, index + span) range contains this
                            # cell's column.
                            for col_header in col_headers:
                                if (
                                    col_header.text not in ("", "—")
                                    and col_index >= col_header.index
                                    and col_index
                                    < col_header.index + col_header.span
                                ):
                                    raw_tags.append(col_header.text)
                            # One form per text line of the cell.
                            for line in cell_text.splitlines():
                                form = Form(
                                    form=line,
                                    source=page_tite,
                                    raw_tags=raw_tags,
                                    article=article,
                                )
                                if form.form not in ("", "—"):
                                    translate_raw_tags(form)
                                    word_entry.forms.append(form)
                        # Data cells always advance the column position,
                        # whether read as article or as form.
                        col_index += int(cell_node.attrs.get("colspan", "1"))

140 

141 

def process_deutsch_verb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Expand a "Deutsch Verb…" template and process its sections.

    The template node is turned back into wikitext, parsed again with
    full expansion, and every level node yielded by ``find_child`` on the
    result is passed to ``process_deutsch_verb_section`` together with
    the unchanged remaining arguments.
    """
    # de.wiktionary template: Vorlage:Deutsch Verb regelmäßig
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for section_node in expanded_root.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, section_node, page_tite, shared_raw_tags
        )

157 

158 

def process_deutsch_verb_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Process one heading section of an expanded verb template.

    The section title is appended to a copy of ``shared_raw_tags`` (the
    caller's list is left untouched).  That extended list is used for
    every table yielded by ``find_child`` on the section and, afterwards,
    for every nested level node, which is handled recursively.
    """
    title = clean_node(wxr, None, level_node.largs)
    section_raw_tags = [*shared_raw_tags, title]

    # Tables of this section come first, then the nested sections.
    for table in level_node.find_child(NodeKind.TABLE):
        process_deutsch_verb_table(
            wxr, word_entry, table, page_tite, section_raw_tags
        )
    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, subsection, page_tite, section_raw_tags
        )

177 

178 

def process_deutsch_verb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Extract verb forms from one table of an expanded verb template.

    Rows are read in order while a list of column headers is carried from
    row to row.  Header cells, and data cells that look like headers, feed
    the column headers or the row header.  Every line of a remaining data
    cell becomes a ``Form`` appended to ``word_entry.forms``.  Its raw tags
    are ``shared_raw_tags`` plus an optional ``label:`` prefix of the
    line, the row header, and the covering column headers.

    Args:
        wxr: extraction context used for ``clean_node``.
        word_entry: entry whose ``forms`` list is extended.
        table: table node to read.
        page_tite: page title stored as ``Form.source``.
        shared_raw_tags: raw tags passed to every created form.
    """
    col_headers = []
    for row in table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        # Two independent positions: col_index counts data cells,
        # col_header_index counts header cells (by colspan).
        col_index = 0
        col_header_index = 0
        # True when every non-empty data cell of the row contains bold
        # text; also True for a row without any non-empty data cell.
        is_bold_col_header = all(
            c.contain_node(NodeKind.BOLD)
            for c in row.find_child(NodeKind.TABLE_CELL)
            if clean_node(wxr, None, c) != ""
        )
        # A row consisting of a single cell starts a new sub-table, so
        # the headers collected so far are dropped.
        if (
            len(
                list(
                    row.find_child(
                        NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                    )
                )
            )
            == 1
        ):
            col_headers.clear()  # new table
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell_text in (
                "Flexion der Verbaladjektive",
                "(nichterweiterte) Infinitive",
            ):
                # Stop reading the rest of this row at these captions.
                break
            elif cell.kind == NodeKind.TABLE_HEADER_CELL and cell_text not in (
                "",
                "Person",
            ):
                # Labelled header cell: record it as a column header.
                colspan = int(cell.attrs.get("colspan", "1"))
                col_headers.append(
                    SpanHeader(
                        cell_text,
                        col_header_index,
                        colspan,
                    )
                )
                col_header_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if cell_text in (
                    "",
                    "—",
                    "Text",
                    "Person",
                ) or cell_text.startswith("Flexion:"):
                    # Placeholder cell: no form, but it occupies a column.
                    col_index += 1
                elif (
                    cell.contain_node(NodeKind.BOLD)
                    or (
                        len(list(cell.find_html("small"))) > 0
                        and len(list(cell.filter_empty_str_child())) == 1
                    )
                    # de.wiktionary template:
                    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
                    or cell.attrs.get("bgcolor", "").lower() == "#f4f4f4"
                ):  # header in cell
                    colspan = int(cell.attrs.get("colspan", "1"))
                    if is_bold_col_header:
                        # In an all-bold row each bold node of the cell
                        # becomes a column header at the same position.
                        for bold_node in cell.find_child(NodeKind.BOLD):
                            col_headers.append(
                                SpanHeader(
                                    clean_node(wxr, None, bold_node),
                                    col_header_index,
                                    colspan,
                                )
                            )
                    else:
                        # Otherwise the cell labels the rest of the row.
                        row_header = cell_text
                    # Header-like data cells advance only the header
                    # position, not col_index.
                    col_header_index += colspan
                else:
                    # One form per text line of the cell.
                    for form_text in cell_text.splitlines():
                        form_text = form_text.strip(", ")
                        form_raw_tag = ""
                        # Text before the first colon is used as an extra
                        # raw tag, the remainder as the form.
                        if ":" in form_text:
                            form_raw_tag, form_text = form_text.split(":", 1)
                        form = Form(
                            form=form_text.strip(),
                            source=page_tite,
                            raw_tags=shared_raw_tags,
                        )
                        if form_raw_tag != "":
                            form.raw_tags.append(form_raw_tag)
                        if row_header != "":
                            form.raw_tags.append(row_header)
                        # Add every column header whose
                        # [index, index + span) range contains this cell.
                        for col_header in col_headers:
                            if (
                                col_index >= col_header.index
                                and col_index
                                < col_header.index + col_header.span
                            ):
                                # Headers ending in "I" stay one tag
                                # (presumably names such as
                                # "Konjunktiv I" - TODO confirm); all
                                # others yield one tag per word.
                                if col_header.text.endswith("I"):
                                    form.raw_tags.append(col_header.text)
                                else:
                                    for raw_tag in col_header.text.split():
                                        form.raw_tags.append(raw_tag)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    col_index += 1

288 

289 

def extract_deklinationsseite_numerale_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from an expanded "Deklinationsseite Numerale" template.

    The template is converted back to wikitext, expanded and parsed, and
    each table yielded by ``find_child`` is read row by row.  Header cells
    become the row header (in rows that contain data cells) or column
    headers (in rows that do not).  The content of a data cell is split
    at ``<br>`` tags; every non-empty piece is handed to
    ``deklinationsseite_numerale_add_form``, italic children being
    collected as raw tags instead of form text.

    Args:
        wxr: extraction context providing the wikitext parser.
        word_entry: entry that receives the forms.
        t_node: the template node to expand.
        page_tite: page title passed on as the form source.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Numerale
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_template.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            # Decides whether header cells of this row label the row or
            # the columns.
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    # Empty cells are skipped without advancing col_index.
                    continue
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if row_has_data:
                        row_header = cell_text
                    else:
                        col_span = int(cell.attrs.get("colspan", "1"))
                        # The first header of a header-only row starts a
                        # new set of column headers (row_has_data is
                        # already known to be False in this branch).
                        if col_index == 0 and not row_has_data:
                            col_headers.clear()  # new table
                        col_headers.append(
                            SpanHeader(cell_text, col_index, col_span)
                        )
                        col_index += col_span
                else:
                    # Nodes of the word currently being assembled.
                    word_nodes = []
                    # Italic labels seen so far in this cell.  The list is
                    # not reset at <br>, so a label is also passed along
                    # with the later words of the same cell.
                    raw_tags = []
                    for cell_child in cell.children:
                        if (
                            isinstance(cell_child, HTMLNode)
                            and cell_child.tag == "br"
                        ):
                            # <br> ends the current word.
                            word = clean_node(wxr, None, word_nodes)
                            if word != "":
                                deklinationsseite_numerale_add_form(
                                    word_entry,
                                    word,
                                    page_tite,
                                    raw_tags,
                                    col_index,
                                    row_header,
                                    col_headers,
                                )
                            word_nodes.clear()
                        elif (
                            isinstance(cell_child, WikiNode)
                            and cell_child.kind == NodeKind.ITALIC
                        ):
                            # Italic text is a label; surrounding colons
                            # and spaces are stripped.
                            raw_tag = clean_node(wxr, None, cell_child).strip(
                                ": "
                            )
                            if raw_tag != "":
                                raw_tags.append(raw_tag)
                        else:
                            word_nodes.append(cell_child)
                    # Flush the word after the last <br> (or the whole
                    # cell when it has no <br>).
                    word = clean_node(wxr, None, word_nodes)
                    if word != "":
                        deklinationsseite_numerale_add_form(
                            word_entry,
                            word,
                            page_tite,
                            raw_tags,
                            col_index,
                            row_header,
                            col_headers,
                        )
                    col_index += 1

366 

367 

def deklinationsseite_numerale_add_form(
    word_entry: WordEntry,
    word: str,
    source: str,
    raw_tags: list[str],
    index: int,
    row_header: str,
    col_headers: list[SpanHeader],
) -> None:
    """Build a ``Form`` for ``word`` and append it to ``word_entry.forms``.

    The form starts with ``raw_tags``.  A non-empty ``row_header`` is
    added next, followed by the text of every column header whose column
    range ``[index, index + span)`` contains ``index``.  The raw tags are
    then passed through ``translate_raw_tags``.
    """
    new_form = Form(form=word, source=source, raw_tags=raw_tags)
    if row_header != "":
        new_form.raw_tags.append(row_header)
    new_form.raw_tags.extend(
        header.text
        for header in col_headers
        if header.index <= index < header.index + header.span
    )
    translate_raw_tags(new_form)
    word_entry.forms.append(new_form)