Coverage for src/wiktextract/extractor/nl/inflection.py: 85%

244 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .models import Form, WordEntry 

14from .tags import translate_raw_tags 

15 

# Names of the inflection ("forms table") templates known to this module.
# extract_inflection_template() dispatches on these names.
FORMS_TABLE_TEMPLATES = frozenset(
    {
        # handled by extract_noun_adj_table()
        "-nlnoun-",
        "adjcomp",
        "-nlname-",
        "-denoun-",
        "-denoun1-",
        # handled by extract_nlstam_template()
        "-nlstam-",
        # extract_inflection_template() matches this one as a name prefix
        "-csadjc-comp-",
        # handled by extract_dumstam_template()
        "-dumstam-",
    }
)

28 

29 

def extract_inflection_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection template node to its dedicated extractor.

    The template name selects the extractor; every extractor appends the
    forms it finds to ``word_entry.forms``.  A template whose name matches
    none of the cases below is silently ignored.
    """
    name = t_node.template_name
    noun_adj_templates = (
        "-nlnoun-",
        "adjcomp",
        "-nlname-",
        "-denoun-",
        "-denoun1-",
    )
    if name in noun_adj_templates:
        extract_noun_adj_table(wxr, word_entry, t_node)
        return
    if name == "-nlstam-":
        extract_nlstam_template(wxr, word_entry, t_node)
        return
    # Prefix match: the template family has suffixed names, e.g.
    # "-csadjc-comp-ý3-".
    if name.startswith("-csadjc-comp-"):
        extract_csadjc_comp_template(wxr, word_entry, t_node)
        return
    if name == "-dumstam-":
        extract_dumstam_template(wxr, word_entry, t_node)

47 

48 

def extract_noun_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from an expanded noun/adjective inflection table.

    The template is expanded and re-parsed.  Header cells supply column
    tags, the first data cell of each row supplies the row tag, and every
    non-empty line of the remaining data cells becomes a ``Form`` appended
    to ``word_entry.forms``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlnoun-
    # https://nl.wiktionary.org/wiki/Sjabloon:adjcomp
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)

    skipped_forms = ("", "-", wxr.wtp.title)
    untagged_row_headers = ("", "naamwoord", "demoniem")
    # Header texts accumulate over all rows (and tables) in document order.
    column_headers = []
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header_cell)
                if header_text != "":
                    column_headers.append(header_text)

            row_header = ""
            data_cells = row.find_child(NodeKind.TABLE_CELL)
            for position, data_cell in enumerate(data_cells):
                cell_text = clean_node(wxr, None, data_cell)
                if position == 0:
                    # The first data cell of a row is its label.
                    row_header = cell_text
                    continue
                header_index = position - 1
                for form_text in cell_text.splitlines():
                    if form_text in skipped_forms:
                        continue
                    form = Form(form=form_text)
                    if row_header not in untagged_row_headers:
                        form.raw_tags.append(row_header)
                    if header_index < len(column_headers):
                        form.raw_tags.append(column_headers[header_index])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

    # Top-level links of the expansion are cleaned with word_entry as the
    # collecting target (presumably to pick up category links — confirm
    # against clean_node).
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)

87 

88 

def extract_nlstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the forms listed in a "-nlstam-" verb template.

    Positional argument 2 holds the past form(s) and argument 3 the past
    participle(s), one per line.  The argument three positions later (5
    and 6) holds the matching IPA lines.  Afterwards the conjugation
    subpage is extracted once per word entry.
    """
    # verb table
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlstam-
    params = t_node.template_parameters
    tags_by_arg = {2: ["past"], 3: ["past", "participle"]}
    for arg, form_tags in tags_by_arg.items():
        form_lines = clean_node(wxr, None, params.get(arg, "")).splitlines()
        ipa_lines = clean_node(
            wxr, None, params.get(arg + 3, "")
        ).splitlines()
        for index, form_str in enumerate(form_lines):
            if form_str == "":
                continue
            form = Form(form=form_str)
            # IPA lines are paired with form lines by position.
            if index < len(ipa_lines):
                form.ipa = ipa_lines[index]
            form.tags.extend(form_tags)
            word_entry.forms.append(form)

    clean_node(wxr, word_entry, t_node)
    if not word_entry.extracted_vervoeging_page:
        extract_vervoeging_page(wxr, word_entry)
        word_entry.extracted_vervoeging_page = True

114 

115 

def extract_vervoeging_page(
    wxr: WiktextractContext, word_entry: WordEntry
) -> None:
    """Extract conjugation tables from the "<title>/vervoeging" page.

    Returns without doing anything when that page cannot be found.  Table
    templates are looked up at three places: directly under the page root,
    under each heading nested inside the language section that matches
    ``word_entry.lang``, and directly under that language heading.
    """
    # NOTE(review): the second argument (0) is presumably the namespace id
    # of the main namespace — confirm against wikitextprocessor.
    page = wxr.wtp.get_page(f"{wxr.wtp.title}/vervoeging", 0)
    if page is None:
        return
    root = wxr.wtp.parse(page.body)
    table_templates = (
        "-nlverb-",
        "-nlverb-reflex-",
        "-nlverb-onp-",
        "-dumverb-",
    )

    def extract_tables(parent: WikiNode, sense_text: str) -> None:
        # Handle every conjugation table template that is a direct child.
        for template in parent.find_child(NodeKind.TEMPLATE):
            if template.template_name in table_templates:
                extract_nlverb_template(
                    wxr, word_entry, template, sense_text
                )

    extract_tables(root, "")

    sense = ""
    for lang_level_node in root.find_child(NodeKind.LEVEL2):
        if clean_node(wxr, None, lang_level_node.largs) != word_entry.lang:
            continue
        for sense_level_node in lang_level_node.find_child_recursively(
            LEVEL_KIND_FLAGS
        ):
            sense = clean_node(wxr, None, sense_level_node.largs)
            extract_tables(sense_level_node, sense)
        # The page may only have a language level heading.
        # NOTE(review): when sub-headings exist, `sense` still holds the
        # text of the last one here — confirm that this is intended.
        extract_tables(lang_level_node, sense)

148 

149 

@dataclass
class TableHeader:
    """A header cell of a conjugation table with its grid position.

    The extractors in this module test whether a data cell at
    ``(row_index, col_index)`` falls inside the half-open ranges
    ``[row_index, row_index + rowspan)`` and
    ``[col_index, col_index + colspan)`` of a header.
    """

    # cleaned text of the header cell, used as a raw tag
    text: str
    # column counter value at the time the header was recorded
    col_index: int
    # parsed "colspan" attribute of the cell
    colspan: int
    # row counter value at the time the header was recorded
    row_index: int
    # parsed "rowspan" attribute of the cell
    rowspan: int

157 

158 

# Maps the beginning of a table header text to the tags shared by the
# forms of that table section.  extract_nlverb_template() tests the
# entries in insertion order with str.startswith() and stops at the first
# match.
NLVERB_HEADER_PREFIXES = {
    "vervoeging van de bedrijvende vorm van": ["active"],
    "onpersoonlijke lijdende vorm": ["impersonal", "passive"],
    "lijdende vorm": ["passive"],
}

164 

165 

def extract_nlverb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    sense: str,
) -> None:
    """Extract verb forms from an expanded conjugation table template.

    The template is expanded and re-parsed.  "-dumverb-" tables are handed
    to extract_dumverb_table(); for every other template each table is
    walked row by row.  Header cells are recorded as shared tags, column
    headers or row headers; every data cell yields one or more ``Form``
    objects tagged with the headers that cover its position.  All forms
    get the "<title>/vervoeging" page as ``source`` and the given
    ``sense``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlverb-
    # Sjabloon:-nlverb-reflex-
    # Sjabloon:-dumverb-
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)
    if t_node.template_name == "-dumverb-":
        extract_dumverb_table(wxr, word_entry, expanded_node, sense)
        return

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        row_index = 0
        shared_tags = []
        shared_raw_tags = []
        previous_row_all_header = False
        col_headers = []
        row_headers = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            # Start the row to the right of row headers that still span
            # into it.
            # NOTE(review): the offset added is the header's rowspan;
            # colspan may have been intended — confirm before changing.
            col_index = 0
            for spanning in row_headers:
                first_row = spanning.row_index
                if first_row <= row_index < first_row + spanning.rowspan:
                    col_index += spanning.rowspan

            row_all_header = all(
                nlverb_table_cell_is_header(cell)
                for cell in row_node.find_child(cell_kinds)
            )
            # A header-only row that follows a non header-only row opens a
            # new table section: forget everything collected so far.
            if row_all_header and not previous_row_all_header:
                row_index = 0
                shared_tags.clear()
                shared_raw_tags.clear()
                col_headers.clear()
                row_headers.clear()

            small_tag = ""
            is_row_first_node = True
            for cell_node in row_node.find_child(cell_kinds):
                # Non-numeric span attributes fall back to 1.
                colspan_attr = cell_node.attrs.get("colspan", "1")
                cell_colspan = (
                    int(colspan_attr)
                    if re.fullmatch(r"\d+", colspan_attr)
                    else 1
                )
                rowspan_attr = cell_node.attrs.get("rowspan", "1")
                cell_rowspan = (
                    int(rowspan_attr)
                    if re.fullmatch(r"\d+", rowspan_attr)
                    else 1
                )
                cell_str = clean_node(wxr, None, cell_node).strip("| ")

                if cell_str in ["", "—", wxr.wtp.title]:
                    pass  # nothing to record, the cell still takes up space
                elif nlverb_table_cell_is_header(cell_node):
                    prefix_tags = next(
                        (
                            tags
                            for prefix, tags in NLVERB_HEADER_PREFIXES.items()
                            if cell_str.startswith(prefix)
                        ),
                        None,
                    )
                    if prefix_tags is not None:
                        shared_tags.extend(prefix_tags)
                    elif cell_str.startswith("vervoeging van "):
                        pass
                    elif row_all_header:
                        if (
                            is_row_first_node
                            and t_node.template_name == "-nlverb-"
                        ):
                            shared_raw_tags.append(cell_str)
                        else:
                            col_headers.append(
                                TableHeader(
                                    cell_str,
                                    col_index,
                                    cell_colspan,
                                    row_index,
                                    cell_rowspan,
                                )
                            )
                    else:
                        # Row header: keep only the text before "(".
                        if "(" in cell_str:
                            cell_str = cell_str.partition("(")[0].strip()
                        row_headers.append(
                            TableHeader(
                                cell_str,
                                col_index,
                                cell_colspan,
                                row_index,
                                cell_rowspan,
                            )
                        )
                else:  # data cell
                    # A cell containing a <small> element is remembered as
                    # a raw tag for the next form of this row.
                    if any(True for _ in cell_node.find_html("small")):
                        small_tag = cell_str
                        col_index += cell_colspan
                        continue

                    if "/ " in cell_str:  # "zweerde/ zwoor"
                        form_texts = cell_str.split("/")
                    elif "/" in cell_str and " " in cell_str:
                        # "zult/zal zweren" -> ["zult zweren", "zal zweren"]
                        head, space, tail = cell_str.partition(" ")
                        form_texts = [
                            variant + space + tail
                            for variant in head.split("/")
                        ]
                    else:
                        form_texts = [cell_str]

                    for form_str in form_texts:
                        form_str = form_str.strip()
                        if len(form_str) == 0:
                            continue
                        form = Form(
                            form=form_str,
                            tags=shared_tags,
                            raw_tags=shared_raw_tags,
                            source=f"{wxr.wtp.title}/vervoeging",
                            sense=sense,
                        )
                        if small_tag != "":
                            form.raw_tags.append(small_tag)
                            small_tag = ""
                        form.raw_tags.extend(
                            header.text
                            for header in row_headers
                            if header.row_index
                            <= row_index
                            < header.row_index + header.rowspan
                        )
                        form.raw_tags.extend(
                            header.text
                            for header in col_headers
                            if header.col_index
                            <= col_index
                            < header.col_index + header.colspan
                        )
                        translate_raw_tags(form)
                        word_entry.forms.append(form)

                col_index += cell_colspan
                is_row_first_node = False

            row_index += 1
            previous_row_all_header = row_all_header

325 

326 

def nlverb_table_cell_is_header(node: WikiNode) -> bool:
    """Tell whether a table cell node acts as a header.

    A cell counts as a header when it is a real table header cell or when
    its ``class`` attribute is exactly "infoboxrijhoofding".
    """
    if node.kind == NodeKind.TABLE_HEADER_CELL:
        return True
    return node.attrs.get("class", "") == "infoboxrijhoofding"

332 

333 

def extract_csadjc_comp_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from an expanded "-csadjc-comp-…" table.

    Within a row, the text of the most recent header cell is attached as a
    raw tag to each following data cell.  Empty data cells and cells equal
    to the page title produce no form.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-csadjc-comp-ý3-
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            for cell_node in row.find_child(cell_kinds):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header = clean_node(wxr, None, cell_node)
                    continue
                if cell_node.kind != NodeKind.TABLE_CELL:
                    continue
                form_text = clean_node(wxr, None, cell_node)
                if form_text in ["", wxr.wtp.title]:
                    continue
                form = Form(form=form_text)
                if row_header != "":
                    form.raw_tags.append(row_header)
                    translate_raw_tags(form)
                word_entry.forms.append(form)

357 

358 

def extract_dumstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the principal parts listed in a "-dumstam-" template.

    Positional arguments 1 to 4 are tagged, in order, as infinitive, past
    singular, past plural and past participle.  Empty arguments and
    arguments equal to ``word_entry.word`` are skipped.  Afterwards the
    conjugation subpage is extracted once per word entry.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-dumstam-
    tags = [
        ["infinitive"],
        ["past", "singular"],
        ["past", "plural"],
        ["past", "participle"],
    ]
    params = t_node.template_parameters
    for arg_name, form_tags in enumerate(tags, start=1):
        word = clean_node(wxr, None, params.get(arg_name, ""))
        if word in ["", word_entry.word]:
            continue
        word_entry.forms.append(Form(form=word, tags=form_tags))

    clean_node(wxr, word_entry, t_node)
    if not word_entry.extracted_vervoeging_page:
        extract_vervoeging_page(wxr, word_entry)
        word_entry.extracted_vervoeging_page = True

380 

381 

def extract_dumverb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    expanded_node: WikiNode,
    sense: str,
) -> None:
    """Extract verb forms from an expanded "-dumverb-" table.

    Rows are read from the first table found among the children of
    ``expanded_node``, or from ``expanded_node`` itself when it has no
    table child.  Header-only rows define column headers; in other rows a
    header cell is the row header and every non-empty line of a data cell
    becomes a ``Form`` tagged with the row header and the column headers
    covering its position.
    """
    table_node = next(
        iter(expanded_node.find_child(NodeKind.TABLE)), expanded_node
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    col_headers = []
    previous_row_all_header = False
    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        row_header = ""
        row_all_header = all(
            nlverb_table_cell_is_header(cell)
            for cell in row_node.find_child(cell_kinds)
        )
        # The first row of a block of header-only rows starts a new set of
        # column headers.
        if row_all_header and not previous_row_all_header:
            col_headers.clear()

        for cell_node in row_node.find_child(cell_kinds):
            # A non-numeric colspan attribute falls back to 1.
            colspan_attr = cell_node.attrs.get("colspan", "1")
            cell_colspan = (
                int(colspan_attr) if re.fullmatch(r"\d+", colspan_attr) else 1
            )
            cell_str = clean_node(wxr, None, cell_node).strip("!| \n")
            # Skipped cells do not advance the column counter.
            if cell_str in ["", "—", wxr.wtp.title]:
                continue

            if nlverb_table_cell_is_header(cell_node):
                if row_all_header:
                    col_headers.append(
                        TableHeader(
                            cell_str,
                            col_index,
                            cell_colspan,
                            0,
                            0,
                        )
                    )
                    col_index += cell_colspan
                else:
                    # A row header does not advance the column counter.
                    row_header = cell_str
                continue

            # data cell: one form per non-empty line
            for cell_line in cell_str.splitlines():
                cell_line = cell_line.strip()
                if cell_line == "":
                    continue
                form = Form(
                    form=cell_line,
                    source=f"{wxr.wtp.title}/vervoeging",
                    sense=sense,
                )
                if row_header != "":
                    form.raw_tags.append(row_header)
                form.raw_tags.extend(
                    header.text
                    for header in col_headers
                    if header.col_index
                    <= col_index
                    < header.col_index + header.colspan
                )
                translate_raw_tags(form)
                word_entry.forms.append(form)
            col_index += cell_colspan

        previous_row_all_header = row_all_header