Coverage for src/wiktextract/extractor/nl/inflection.py: 85%

255 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-06-23 09:14 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .models import Form, WordEntry 

14from .tags import translate_raw_tags 

15 

# Template names that extract_inflection_template() has a branch for.
# "-csadjc-comp-" is matched there as a name prefix, the others exactly.
FORMS_TABLE_TEMPLATES = frozenset(
    {
        "-nlnoun-",
        "adjcomp",
        "-nlname-",
        "-denoun-",
        "-denoun1-",
        "-nlstam-",
        "-csadjc-comp-",
        "-dumstam-",
    }
)

28 

29 

def extract_inflection_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection template to the extractor that handles it.

    The template name selects the extractor; a template whose name matches
    none of the cases below is silently ignored.
    """
    name = t_node.template_name
    if name in ("-nlnoun-", "adjcomp", "-nlname-", "-denoun-", "-denoun1-"):
        extract_noun_adj_table(wxr, word_entry, t_node)
        return
    if name == "-nlstam-":
        extract_nlstam_template(wxr, word_entry, t_node)
        return
    # The "-csadjc-comp-" family is matched by prefix, not by exact name.
    if name.startswith("-csadjc-comp-"):
        extract_csadjc_comp_template(wxr, word_entry, t_node)
        return
    if name == "-dumstam-":
        extract_dumstam_template(wxr, word_entry, t_node)

47 

48 

def extract_noun_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from the table produced by a noun/adjective template.

    The template is expanded and every table in the expansion is walked row
    by row.  Non-empty header cells are collected into one running list that
    is never reset, so it spans all rows and all tables of the expansion.
    In each row the first data cell is the row label; every later data cell
    contributes one form per line of text, tagged with the row label and the
    header at the matching position.

    See https://nl.wiktionary.org/wiki/Sjabloon:-nlnoun- and
    https://nl.wiktionary.org/wiki/Sjabloon:adjcomp
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    # Row labels that are not stored as a raw tag.
    ignored_row_labels = ("", "naamwoord", "demoniem")
    header_texts = []
    for table in root.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            for th_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
                th_text = clean_node(wxr, None, th_node)
                if th_text:
                    header_texts.append(th_text)

            row_label = ""
            for cell_pos, td_node in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, td_node)
                if cell_pos == 0:
                    row_label = cell_text
                    continue
                # Data cell i pairs with collected header i - 1 because the
                # first data cell of the row is the row label.
                header_pos = cell_pos - 1
                for line in cell_text.splitlines():
                    # Skip blanks, the "-" placeholder and the page title.
                    if line in ("", "-", wxr.wtp.title):
                        continue
                    form = Form(form=line)
                    if row_label not in ignored_row_labels:
                        form.raw_tags.append(row_label)
                    if header_pos < len(header_texts):
                        form.raw_tags.append(header_texts[header_pos])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

    # Top-level links of the expansion are cleaned against the word entry;
    # presumably this is how categories get recorded (confirm in clean_node).
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)

87 

88 

def extract_nlstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the principal parts given as arguments of ``-nlstam-``.

    Positional argument 2 yields "past" forms and argument 3 yields
    "past participle" forms, one form per line.  Arguments 5 and 6 hold the
    IPA strings, which are paired with the forms line by line.  Afterwards
    the "<title>/vervoeging" page is extracted once per word entry.

    See https://nl.wiktionary.org/wiki/Sjabloon:-nlstam-
    """
    for arg, form_tags in ((2, ["past"]), (3, ["past", "participle"])):
        forms_text = clean_node(
            wxr, None, t_node.template_parameters.get(arg, "")
        )
        # The IPA for argument N lives in argument N + 3.
        ipa_lines = clean_node(
            wxr, None, t_node.template_parameters.get(arg + 3, "")
        ).splitlines()
        for line_no, form_text in enumerate(forms_text.splitlines()):
            if form_text == "":
                continue
            form = Form(form=form_text)
            if line_no < len(ipa_lines):
                form.ipa = ipa_lines[line_no]
            form.tags.extend(form_tags)
            word_entry.forms.append(form)

    clean_node(wxr, word_entry, t_node)
    # The flag keeps the conjugation subpage from being processed twice.
    if word_entry.extracted_vervoeging_page:
        return
    extract_vervoeging_page(wxr, word_entry)
    word_entry.extracted_vervoeging_page = True

114 

115 

def extract_vervoeging_page(
    wxr: WiktextractContext, word_entry: WordEntry
) -> None:
    """Extract conjugation tables from the "<title>/vervoeging" subpage.

    Does nothing when the subpage does not exist.  Conjugation templates are
    looked for in three places: directly under the page root (empty sense),
    under every heading nested in the level-2 section whose title equals
    ``word_entry.lang`` (sense = heading text), and directly under that
    level-2 section itself.
    """
    conjugation_page = wxr.wtp.get_page(f"{wxr.wtp.title}/vervoeging", 0)
    if conjugation_page is None:
        return
    tree = wxr.wtp.parse(conjugation_page.body)
    wanted_templates = (
        "-nlverb-",
        "-nlverb-reflex-",
        "-nlverb-onp-",
        "-dumverb-",
    )

    def handle_templates(parent, heading: str) -> None:
        # Process the conjugation templates that are direct children.
        for template in parent.find_child(NodeKind.TEMPLATE):
            if template.template_name in wanted_templates:
                extract_nlverb_template(wxr, word_entry, template, heading)

    handle_templates(tree, "")

    heading = ""
    for lang_node in tree.find_child(NodeKind.LEVEL2):
        if clean_node(wxr, None, lang_node.largs) != word_entry.lang:
            continue
        for section in lang_node.find_child_recursively(LEVEL_KIND_FLAGS):
            heading = clean_node(wxr, None, section.largs)
            handle_templates(section, heading)
        # Pages that only have the language heading keep their templates
        # directly under it.
        # NOTE(review): `heading` still holds the last nested heading seen,
        # so templates here inherit it when the section also has
        # sub-headings; confirm that this is intended.
        handle_templates(lang_node, heading)

148 

149 

@dataclass
class TableHeader:
    """A header cell of a conjugation table and the area it was placed at."""

    # Cleaned text of the header cell.
    text: str
    # Column at which the cell starts and how many columns it covers.
    col_index: int
    colspan: int
    # Row at which the cell starts and how many rows it covers.
    row_index: int
    rowspan: int

157 

158 

# Maps the beginning of a table caption/header text to the tags shared by
# every form that follows it; extract_nlverb_template() uses the first
# prefix, in insertion order, that the header text starts with.
NLVERB_HEADER_PREFIXES: dict[str, list[str]] = {
    "vervoeging van de bedrijvende vorm van": ["active"],
    "onpersoonlijke lijdende vorm": ["impersonal", "passive"],
    "lijdende vorm": ["passive"],
}

164 

165 

def _nlverb_cell_span(cell_node: WikiNode, attr_name: str) -> int:
    """Return the ``colspan``/``rowspan`` value of a table cell.

    Falls back to 1 when the attribute is missing or is not made of digits
    only.
    """
    span_str = cell_node.attrs.get(attr_name, "1")
    return int(span_str) if re.fullmatch(r"\d+", span_str) else 1


def extract_nlverb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    sense: str,
) -> None:
    """Extract verb forms from an expanded conjugation table template.

    Handles https://nl.wiktionary.org/wiki/Sjabloon:-nlverb- ,
    Sjabloon:-nlverb-reflex- and Sjabloon:-nlverb-onp- here;
    Sjabloon:-dumverb- is delegated to ``extract_dumverb_table``.

    A table is processed as a sequence of groups.  A group starts with a row
    made only of header cells that follows a row that was not; starting a
    group resets the row counter and drops all previously collected tags and
    headers.  Within a group:

    * header text starting with a key of ``NLVERB_HEADER_PREFIXES`` adds
      that key's tags to every following form,
    * other headers in an all-header row become column headers, except the
      first cell of such a row in ``-nlverb-``, which becomes a raw tag
      shared by every following form,
    * headers in a mixed row become row headers, with anything from the
      first "(" on removed,
    * every other non-empty cell is a data cell and yields one form per
      piece returned by ``nl_split_cell``.

    Args:
        wxr: extraction context; used for parsing, cleaning and page title.
        word_entry: entry that receives the forms.
        t_node: the conjugation template node found on the subpage.
        sense: stored unchanged in the ``sense`` field of every form.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)
    if t_node.template_name == "-dumverb-":
        extract_dumverb_table(wxr, word_entry, expanded_node, sense)
        return

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        row_index = 0
        shared_tags = []
        shared_raw_tags = []
        last_row_all_header = False
        col_headers = []
        row_headers = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            # Row headers from earlier rows whose rowspan reaches this row
            # occupy its leading columns, so this row's own cells start
            # after them.  A spanning cell takes up `colspan` columns; the
            # previous code added `rowspan` here, which misaligned every
            # row after the first one under a multi-row header.
            col_index = 0
            for row_header in row_headers:
                if (
                    row_index >= row_header.row_index
                    and row_index < row_header.row_index + row_header.rowspan
                ):
                    col_index += row_header.colspan

            current_row_all_header = all(
                nlverb_table_cell_is_header(n)
                for n in row_node.find_child(cell_kinds)
            )
            if current_row_all_header and not last_row_all_header:
                # A new block of header rows starts a new group.
                row_index = 0
                shared_tags.clear()
                shared_raw_tags.clear()
                col_headers.clear()
                row_headers.clear()

            # Text of a preceding <small> cell, attached to the next form.
            small_tag = ""
            is_row_first_node = True
            for cell_node in row_node.find_child(cell_kinds):
                cell_colspan = _nlverb_cell_span(cell_node, "colspan")
                cell_rowspan = _nlverb_cell_span(cell_node, "rowspan")
                cell_str = clean_node(wxr, None, cell_node).strip("| ")
                if cell_str in ["", "—", wxr.wtp.title]:
                    # Nothing to extract, but the cell still takes columns.
                    pass
                elif nlverb_table_cell_is_header(cell_node):
                    for (
                        header_prefix,
                        prefix_tags,
                    ) in NLVERB_HEADER_PREFIXES.items():
                        if cell_str.startswith(header_prefix):
                            shared_tags.extend(prefix_tags)
                            break
                    else:
                        if cell_str.startswith("vervoeging van "):
                            # Table caption without a known prefix: ignored.
                            pass
                        elif current_row_all_header:
                            if (
                                is_row_first_node
                                and t_node.template_name == "-nlverb-"
                            ):
                                shared_raw_tags.append(cell_str)
                            else:
                                col_headers.append(
                                    TableHeader(
                                        cell_str,
                                        col_index,
                                        cell_colspan,
                                        row_index,
                                        cell_rowspan,
                                    )
                                )
                        else:
                            if "(" in cell_str:
                                cell_str = cell_str[
                                    : cell_str.index("(")
                                ].strip()
                            row_headers.append(
                                TableHeader(
                                    cell_str,
                                    col_index,
                                    cell_colspan,
                                    row_index,
                                    cell_rowspan,
                                )
                            )
                else:  # data cell
                    if any(True for _ in cell_node.find_html("small")):
                        # A cell holding <small> text labels the next form.
                        small_tag = cell_str
                        col_index += cell_colspan
                        continue
                    for form_str in nl_split_cell(cell_str):
                        form_str = form_str.strip()
                        if len(form_str) == 0:
                            continue
                        # NOTE(review): assumes Form copies the two lists on
                        # construction; if it keeps references, the appends
                        # below would leak into the shared tags. Confirm
                        # against models.Form.
                        form = Form(
                            form=form_str,
                            tags=shared_tags,
                            raw_tags=shared_raw_tags,
                            source=f"{wxr.wtp.title}/vervoeging",
                            sense=sense,
                        )
                        if small_tag != "":
                            form.raw_tags.append(small_tag)
                            small_tag = ""
                        # Row headers are matched on the row only.
                        for row_header in row_headers:
                            if (
                                row_index >= row_header.row_index
                                and row_index
                                < row_header.row_index + row_header.rowspan
                            ):
                                form.raw_tags.append(row_header.text)
                        for col_header in col_headers:
                            if (
                                col_index >= col_header.col_index
                                and col_index
                                < col_header.col_index + col_header.colspan
                            ):
                                form.raw_tags.append(col_header.text)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)

                col_index += cell_colspan
                is_row_first_node = False

            row_index += 1
            last_row_all_header = current_row_all_header

315 

316 

def nlverb_table_cell_is_header(node: WikiNode) -> bool:
    """Tell whether a table cell acts as a header.

    True for real header cells and for cells whose ``class`` attribute is
    exactly "infoboxrijhoofding".
    """
    if node.kind == NodeKind.TABLE_HEADER_CELL:
        return True
    css_class = node.attrs.get("class", "")
    return css_class == "infoboxrijhoofding"

322 

323 

def extract_csadjc_comp_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from an expanded ``-csadjc-comp-...`` template table.

    Within a row, each header cell sets the label used for the data cells
    that follow it; every data cell whose text is neither empty nor the page
    title becomes one form tagged with that label.

    See https://nl.wiktionary.org/wiki/Sjabloon:-csadjc-comp-ý3-
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in root.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            label = ""
            for cell in row.find_child(cell_kinds):
                is_header_cell = cell.kind == NodeKind.TABLE_HEADER_CELL
                if not is_header_cell and cell.kind != NodeKind.TABLE_CELL:
                    continue
                text = clean_node(wxr, None, cell)
                if is_header_cell:
                    label = text
                    continue
                if text in ("", wxr.wtp.title):
                    continue
                form = Form(form=text)
                if label:
                    form.raw_tags.append(label)
                translate_raw_tags(form)
                word_entry.forms.append(form)

347 

348 

def extract_dumstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the principal parts given as arguments of ``-dumstam-``.

    Positional arguments 1 to 4 are, in order, the infinitive, the past
    singular, the past plural and the past participle.  Values that are
    empty or equal to ``word_entry.word`` are skipped.  Afterwards the
    "<title>/vervoeging" page is extracted once per word entry.

    See https://nl.wiktionary.org/wiki/Sjabloon:-dumstam-
    """
    tags_by_position = [
        ["infinitive"],
        ["past", "singular"],
        ["past", "plural"],
        ["past", "participle"],
    ]
    for position, form_tags in enumerate(tags_by_position, start=1):
        value = clean_node(
            wxr, None, t_node.template_parameters.get(position, "")
        )
        if value in ("", word_entry.word):
            continue
        word_entry.forms.append(Form(form=value, tags=form_tags))

    clean_node(wxr, word_entry, t_node)
    # The flag keeps the conjugation subpage from being processed twice.
    if word_entry.extracted_vervoeging_page:
        return
    extract_vervoeging_page(wxr, word_entry)
    word_entry.extracted_vervoeging_page = True

370 

371 

def extract_dumverb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    expanded_node: WikiNode,
    sense: str,
) -> None:
    """Extract forms from the expanded ``-dumverb-`` conjugation table.

    Only the first table under ``expanded_node`` is read; when there is no
    table child, ``expanded_node`` itself is searched for rows.  Rows made
    only of header cells define the column headers, and a block of such rows
    that follows a row that was not all-header replaces the previous ones.
    In other rows a header cell sets the row label and each data cell yields
    one form per non-blank line.

    The column counter advances for column headers and data cells only; row
    labels and skipped cells (empty, "—" or the page title) do not move it.
    """
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    table = next(iter(expanded_node.find_child(NodeKind.TABLE)), expanded_node)

    column_headers = []
    previous_row_all_header = False
    for row in table.find_child(NodeKind.TABLE_ROW):
        column = 0
        row_label = ""
        row_all_header = all(
            nlverb_table_cell_is_header(cell)
            for cell in row.find_child(cell_kinds)
        )
        if row_all_header and not previous_row_all_header:
            column_headers.clear()

        for cell in row.find_child(cell_kinds):
            raw_span = cell.attrs.get("colspan", "1")
            span = int(raw_span) if re.fullmatch(r"\d+", raw_span) else 1
            text = clean_node(wxr, None, cell).strip("!| \n")
            if text in ("", "—", wxr.wtp.title):
                continue

            if nlverb_table_cell_is_header(cell):
                if not row_all_header:
                    row_label = text
                    continue
                # Row/rowspan are not used for this table, hence the zeros.
                column_headers.append(TableHeader(text, column, span, 0, 0))
                column += span
                continue

            for line in map(str.strip, text.splitlines()):
                if not line:
                    continue
                form = Form(
                    form=line,
                    source=f"{wxr.wtp.title}/vervoeging",
                    sense=sense,
                )
                if row_label:
                    form.raw_tags.append(row_label)
                form.raw_tags.extend(
                    header.text
                    for header in column_headers
                    if header.col_index
                    <= column
                    < header.col_index + header.colspan
                )
                translate_raw_tags(form)
                word_entry.forms.append(form)
            column += span

        previous_row_all_header = row_all_header

443 

444 

def nl_split_cell(text: str) -> list[str]:
    """Split the text of a conjugation table cell into separate forms.

    Rules, tried in this order:

    * empty text gives an empty list;
    * text mixing more than one of "/", a newline and "(" is returned as a
      single item, untouched;
    * "a/ b" is split on every "/":  "zweerde/ zwoor" -> two forms;
    * "a/b rest" combines each alternative of the first word with the rest:
      "zult/zal zweren" -> "zult zweren", "zal zweren";
    * "a(b)rest" gives the variant without and with the bracketed part:
      "zou(dt) treinsurfen" -> "zou treinsurfen", "zoudt treinsurfen";
    * text with newlines is split into its lines;
    * anything else is returned as a single item.
    """
    if not text:
        return []

    # Leave messy entries alone; remove this when not applicable anymore
    markers_present = sum(marker in text for marker in ("/", "\n", "("))
    if markers_present > 1:
        return [text]

    if "/ " in text:
        return [piece.strip() for piece in text.split("/")]

    if "/" in text and " " in text:
        # Only the part before the first space holds the alternatives.
        first_word, _, remainder = text.partition(" ")
        return [
            f"{option} {remainder}".strip()
            for option in first_word.split("/")
        ]

    bracketed = re.match(r"([^()]+)\(([^)]+)\)(.+)", text)
    if bracketed:
        stem, optional, rest = bracketed.groups()
        return [stem + rest, stem + optional + rest]

    if "\n" in text:
        return [piece.strip() for piece in text.split("\n")]

    return [text]