Coverage for src/wiktextract/extractor/nl/inflection.py: 85%

1import re

2from dataclasses import dataclass

4from wikitextprocessor.parser import (

5 LEVEL_KIND_FLAGS,

6 NodeKind,

7 TemplateNode,

8 WikiNode,

11from ...page import clean_node

12from ...wxr_context import WiktextractContext

13from .models import Form, WordEntry

14from .tags import translate_raw_tags

# Template names of the inflection tables this module knows how to extract.
# NOTE(review): extract_inflection_template() matches "-csadjc-comp-" as a
# prefix (str.startswith); an exact membership test against this set would
# not match suffixed template names - confirm how callers use this set.
FORMS_TABLE_TEMPLATES = frozenset(
    [
        "-nlnoun-",
        "adjcomp",
        "-nlname-",
        "-denoun-",
        "-denoun1-",
        "-nlstam-",
        "-csadjc-comp-",
        "-dumstam-",
    ]
)

def extract_inflection_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Dispatch an inflection-table template to the matching extractor.

    The choice is made on ``t_node.template_name``:

    * noun/adjective/name tables -> ``extract_noun_adj_table``
    * "-nlstam-" -> ``extract_nlstam_template``
    * any name starting with "-csadjc-comp-" ->
      ``extract_csadjc_comp_template``
    * "-dumstam-" -> ``extract_dumstam_template``

    Any other template name is ignored. Extracted data is added to
    ``word_entry`` by the called extractor; nothing is returned.
    """
    template_name = t_node.template_name
    if template_name in (
        "-nlnoun-",
        "adjcomp",
        "-nlname-",
        "-denoun-",
        "-denoun1-",
    ):
        extract_noun_adj_table(wxr, word_entry, t_node)
    elif template_name == "-nlstam-":
        extract_nlstam_template(wxr, word_entry, t_node)
    elif template_name.startswith("-csadjc-comp-"):
        # Prefix match: the template family carries a suffix after
        # "-csadjc-comp-" (e.g. "-csadjc-comp-ý3-").
        extract_csadjc_comp_template(wxr, word_entry, t_node)
    elif template_name == "-dumstam-":
        extract_dumstam_template(wxr, word_entry, t_node)

def extract_noun_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from a noun/adjective/name inflection table.

    The template is expanded and parsed, then every table row is read:
    non-empty header cells are collected as column headers, the first
    data cell of a row is taken as the row header, and each line of the
    remaining data cells becomes a ``Form`` appended to
    ``word_entry.forms``. Finally every top-level link of the expansion
    is passed through ``clean_node`` together with ``word_entry``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlnoun-
    # https://nl.wiktionary.org/wiki/Sjabloon:adjcomp
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Header texts accumulate over all rows (and tables) of the expansion;
    # the data cell at position i (i >= 1) is paired with header i - 1.
    column_headers = []
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for header_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL
            ):
                header_text = clean_node(wxr, None, header_node)
                if header_text != "":
                    column_headers.append(header_text)

            row_header = ""
            for col_index, data_node in enumerate(
                row_node.find_child(NodeKind.TABLE_CELL)
            ):
                if col_index == 0:
                    # The first data cell labels the row.
                    row_header = clean_node(wxr, None, data_node)
                    continue
                cell_text = clean_node(wxr, None, data_node)
                for form_str in cell_text.splitlines():
                    # Skip placeholders and the page title itself.
                    if form_str in ["", "-", wxr.wtp.title]:
                        continue
                    form = Form(form=form_str)
                    if row_header not in ["", "naamwoord", "demoniem"]:
                        form.raw_tags.append(row_header)
                    if col_index - 1 < len(column_headers):
                        form.raw_tags.append(column_headers[col_index - 1])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)

def extract_nlstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the past and past-participle forms of a "-nlstam-" template.

    Template argument 2 yields forms tagged ``["past"]`` and argument 3
    forms tagged ``["past", "participle"]``; arguments 5 and 6
    (``arg + 3``) supply the matching IPA strings, paired line by line.
    Afterwards the template is passed through ``clean_node`` with
    ``word_entry`` and, once per entry, the "/vervoeging" subpage is
    extracted.
    """
    # verb table
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlstam-
    for arg in [2, 3]:
        form_texts = clean_node(
            wxr, None, t_node.template_parameters.get(arg, "")
        )
        ipa_texts = clean_node(
            wxr, None, t_node.template_parameters.get(arg + 3, "")
        ).splitlines()
        for index, form_str in enumerate(form_texts.splitlines()):
            if form_str == "":
                continue
            form = Form(form=form_str)
            # IPA lines are matched to form lines by position.
            if index < len(ipa_texts):
                form.ipa = ipa_texts[index]
            form.tags.extend(
                ["past"] if arg == 2 else ["past", "participle"]
            )
            word_entry.forms.append(form)

    clean_node(wxr, word_entry, t_node)
    # The flag on the entry keeps the subpage from being extracted twice.
    if not word_entry.extracted_vervoeging_page:
        extract_vervoeging_page(wxr, word_entry)
        word_entry.extracted_vervoeging_page = True

114

115

def extract_vervoeging_page(
    wxr: WiktextractContext, word_entry: WordEntry
) -> None:
    """Extract conjugation tables from the "<title>/vervoeging" subpage.

    Does nothing when the subpage does not exist. Otherwise the page body
    is parsed and every known conjugation template is handed to
    ``extract_nlverb_template``:

    * templates directly under the page root, with an empty sense;
    * templates under sub-sections of the level-2 section whose title
      equals ``word_entry.lang``, with the sub-section title as sense;
    * templates directly under that level-2 section.
    """
    page = wxr.wtp.get_page(f"{wxr.wtp.title}/vervoeging", 0)
    if page is None:
        return
    root = wxr.wtp.parse(page.body)
    table_templates = (
        "-nlverb-",
        "-nlverb-reflex-",
        "-nlverb-onp-",
        "-dumverb-",
    )
    for t_node in root.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in table_templates:
            extract_nlverb_template(wxr, word_entry, t_node, "")

    sense = ""
    for lang_level_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_level_node.largs)
        if lang_name != word_entry.lang:
            continue
        for sense_level_node in lang_level_node.find_child_recursively(
            LEVEL_KIND_FLAGS
        ):
            sense = clean_node(wxr, None, sense_level_node.largs)
            for t_node in sense_level_node.find_child(NodeKind.TEMPLATE):
                if t_node.template_name in table_templates:
                    extract_nlverb_template(wxr, word_entry, t_node, sense)
        # only have language level node
        # NOTE(review): ``sense`` still holds the title of the last
        # sub-section visited above (if any) - confirm that is intended.
        for t_node in lang_level_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name in table_templates:
                extract_nlverb_template(wxr, word_entry, t_node, sense)

148

149

@dataclass
class TableHeader:
    """A header cell of a conjugation table and the grid area it covers."""

    # Cleaned text of the header cell; used as a raw tag on matching forms.
    text: str
    # Column at which the header starts.
    col_index: int
    # Number of columns the header spans.
    colspan: int
    # Row at which the header starts.
    row_index: int
    # Number of rows the header spans.
    rowspan: int

157

158

# Maps the leading text of a conjugation-table title cell to the tags that
# every form of that table section receives. Keys are matched with
# str.startswith() in extract_nlverb_template().
NLVERB_HEADER_PREFIXES = {
    "vervoeging van de bedrijvende vorm van": ["active"],
    "onpersoonlijke lijdende vorm": ["impersonal", "passive"],
    "lijdende vorm": ["passive"],
}

164

165

def extract_nlverb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    sense: str,
) -> None:
    """Extract verb forms from a conjugation-table template.

    The template is expanded and parsed, and every top-level link of the
    expansion is passed through ``clean_node`` with ``word_entry``.
    "-dumverb-" is delegated to ``extract_dumverb_table``. For the other
    templates each table is walked row by row: header cells update the
    shared tags, column headers and row headers; every data cell yields
    one or more ``Form`` objects (source "<title>/vervoeging", the given
    ``sense``) tagged with the headers covering its row and column, which
    are appended to ``word_entry.forms``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlverb-
    # Sjabloon:-nlverb-reflex-
    # Sjabloon:-dumverb-
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)
    if t_node.template_name == "-dumverb-":
        extract_dumverb_table(wxr, word_entry, expanded_node, sense)
        return

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        row_index = 0
        shared_tags = []
        shared_raw_tags = []
        last_row_all_header = False
        col_headers = []
        row_headers = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            # Start past the columns taken by row headers from earlier
            # rows that still span into this row.
            col_index = 0
            for row_header in row_headers:
                if (
                    row_index >= row_header.row_index
                    and row_index < row_header.row_index + row_header.rowspan
                ):
                    # NOTE(review): the offset added is the header's
                    # rowspan, not its colspan - confirm this is intended.
                    col_index += row_header.rowspan

            current_row_all_header = all(
                nlverb_table_cell_is_header(n)
                for n in row_node.find_child(cell_kinds)
            )
            if current_row_all_header and not last_row_all_header:
                # A header-only row after a data row starts a new table
                # section: forget everything collected so far.
                row_index = 0
                shared_tags.clear()
                shared_raw_tags.clear()
                col_headers.clear()
                row_headers.clear()

            small_tag = ""
            is_row_first_node = True
            for cell_node in row_node.find_child(cell_kinds):
                # Non-numeric span attributes fall back to 1.
                cell_colspan_str = cell_node.attrs.get("colspan", "1")
                cell_colspan = (
                    int(cell_colspan_str)
                    if re.fullmatch(r"\d+", cell_colspan_str)
                    else 1
                )
                cell_rowspan_str = cell_node.attrs.get("rowspan", "1")
                cell_rowspan = (
                    int(cell_rowspan_str)
                    if re.fullmatch(r"\d+", cell_rowspan_str)
                    else 1
                )
                cell_str = clean_node(wxr, None, cell_node).strip("| ")
                if cell_str in ["", "—", wxr.wtp.title]:
                    # Empty/placeholder cell: only advances the column.
                    pass
                elif nlverb_table_cell_is_header(cell_node):
                    for (
                        header_prefix,
                        prefix_tags,
                    ) in NLVERB_HEADER_PREFIXES.items():
                        if cell_str.startswith(header_prefix):
                            shared_tags.extend(prefix_tags)
                            break
                    else:
                        # No known title prefix matched.
                        if cell_str.startswith("vervoeging van "):
                            pass
                        elif current_row_all_header:
                            if (
                                is_row_first_node
                                and t_node.template_name == "-nlverb-"
                            ):
                                shared_raw_tags.append(cell_str)
                            else:
                                col_headers.append(
                                    TableHeader(
                                        cell_str,
                                        col_index,
                                        cell_colspan,
                                        row_index,
                                        cell_rowspan,
                                    )
                                )
                        else:
                            # Row header: drop a parenthesised remark.
                            if "(" in cell_str:
                                cell_str = cell_str[
                                    : cell_str.index("(")
                                ].strip()
                            row_headers.append(
                                TableHeader(
                                    cell_str,
                                    col_index,
                                    cell_colspan,
                                    row_index,
                                    cell_rowspan,
                                )
                            )
                else:  # data cell
                    has_small_tag = any(
                        True for _ in cell_node.find_html("small")
                    )
                    if has_small_tag:
                        # A cell containing <small> is kept as a raw tag
                        # for the next form instead of being a form.
                        small_tag = cell_str
                        col_index += cell_colspan
                        continue
                    form_texts = [cell_str]
                    if "/ " in cell_str:  # "zweerde/ zwoor"
                        form_texts = cell_str.split("/")
                    elif "/" in cell_str and " " in cell_str:
                        # "zult/zal zweren" -> ["zult zweren", "zal zweren"]
                        space_index = cell_str.index(" ")
                        second_part = cell_str[space_index:]
                        form_texts = [
                            f_str + second_part
                            for f_str in cell_str[:space_index].split("/")
                        ]
                    for form_str in form_texts:
                        form_str = form_str.strip()
                        if len(form_str) == 0:
                            continue
                        # NOTE(review): presumably Form copies the lists it
                        # is given; otherwise the appends below would also
                        # mutate shared_raw_tags - verify against .models.
                        form = Form(
                            form=form_str,
                            tags=shared_tags,
                            raw_tags=shared_raw_tags,
                            source=f"{wxr.wtp.title}/vervoeging",
                            sense=sense,
                        )
                        if small_tag != "":
                            form.raw_tags.append(small_tag)
                            small_tag = ""
                        for row_header in row_headers:
                            if (
                                row_index >= row_header.row_index
                                and row_index
                                < row_header.row_index + row_header.rowspan
                            ):
                                form.raw_tags.append(row_header.text)
                        for col_header in col_headers:
                            if (
                                col_index >= col_header.col_index
                                and col_index
                                < col_header.col_index + col_header.colspan
                            ):
                                form.raw_tags.append(col_header.text)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)

                col_index += cell_colspan
                is_row_first_node = False

            row_index += 1
            last_row_all_header = current_row_all_header

325

326

def nlverb_table_cell_is_header(node: WikiNode) -> bool:
    """Return True when a table cell should be treated as a header.

    A cell counts as a header if it is a header-cell node or if its
    ``class`` attribute is exactly "infoboxrijhoofding".
    """
    return (
        node.kind == NodeKind.TABLE_HEADER_CELL
        or node.attrs.get("class", "") == "infoboxrijhoofding"
    )

332

333

def extract_csadjc_comp_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from a "-csadjc-comp-..." comparison table.

    The template is expanded and parsed. Within each table row, a header
    cell sets the row header, and every following data cell whose text is
    neither empty nor the page title becomes a ``Form`` (with the row
    header as raw tag, when non-empty) appended to ``word_entry.forms``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-csadjc-comp-ý3-
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            for cell_node in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header = clean_node(wxr, None, cell_node)
                    continue
                if cell_node.kind != NodeKind.TABLE_CELL:
                    continue
                form_text = clean_node(wxr, None, cell_node)
                if form_text in ["", wxr.wtp.title]:
                    continue
                form = Form(form=form_text)
                if row_header != "":
                    form.raw_tags.append(row_header)
                translate_raw_tags(form)
                word_entry.forms.append(form)

357

358

def extract_dumstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the principal parts given to a "-dumstam-" template.

    Positional arguments 1-4 are tagged, in order, as infinitive, past
    singular, past plural and past participle. Arguments that are empty
    or equal to ``word_entry.word`` are skipped. Afterwards the template
    is passed through ``clean_node`` with ``word_entry`` and, once per
    entry, the "/vervoeging" subpage is extracted.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-dumstam-
    tags = [
        ["infinitive"],
        ["past", "singular"],
        ["past", "plural"],
        ["past", "participle"],
    ]
    # Template arguments are 1-based; tags[i - 1] belongs to argument i.
    for arg_name, arg_tags in enumerate(tags, start=1):
        word = clean_node(
            wxr, None, t_node.template_parameters.get(arg_name, "")
        )
        if word not in ["", word_entry.word]:
            form = Form(form=word, tags=arg_tags)
            word_entry.forms.append(form)

    clean_node(wxr, word_entry, t_node)
    # The flag on the entry keeps the subpage from being extracted twice.
    if not word_entry.extracted_vervoeging_page:
        extract_vervoeging_page(wxr, word_entry)
        word_entry.extracted_vervoeging_page = True

380

381

def extract_dumverb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    expanded_node: WikiNode,
    sense: str,
) -> None:
    """Extract forms from the expanded "-dumverb-" conjugation table.

    Only the first table child of ``expanded_node`` is read; when there
    is none, ``expanded_node`` itself is searched for rows. Header cells
    in header-only rows become column headers, a header cell in a mixed
    row becomes that row's header, and every non-empty line of a data
    cell becomes a ``Form`` (source "<title>/vervoeging", the given
    ``sense``) appended to ``word_entry.forms``.
    """
    table_node = expanded_node
    for t_node in expanded_node.find_child(NodeKind.TABLE):
        table_node = t_node
        break

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    col_headers = []
    last_row_all_header = False
    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        row_header = ""
        current_row_all_header = all(
            nlverb_table_cell_is_header(n)
            for n in row_node.find_child(cell_kinds)
        )
        if current_row_all_header and not last_row_all_header:
            # A header-only row after a data row starts a new section.
            col_headers.clear()
        for cell_node in row_node.find_child(cell_kinds):
            # A non-numeric colspan attribute falls back to 1.
            cell_colspan_str = cell_node.attrs.get("colspan", "1")
            cell_colspan = (
                int(cell_colspan_str)
                if re.fullmatch(r"\d+", cell_colspan_str)
                else 1
            )
            cell_str = clean_node(wxr, None, cell_node).strip("!| \n")
            if cell_str in ["", "—", wxr.wtp.title]:
                # NOTE(review): skipped cells do not advance col_index -
                # confirm this keeps later cells aligned with col_headers.
                continue
            is_header = nlverb_table_cell_is_header(cell_node)
            if is_header:
                if current_row_all_header:
                    # Row position and rowspan are not used for this
                    # table, so they are stored as 0.
                    col_headers.append(
                        TableHeader(
                            cell_str,
                            col_index,
                            cell_colspan,
                            0,
                            0,
                        )
                    )
                    col_index += cell_colspan
                else:
                    # Row headers do not take up a column position.
                    row_header = cell_str
            else:
                for cell_line in cell_str.splitlines():
                    cell_line = cell_line.strip()
                    if cell_line == "":
                        continue
                    form = Form(
                        form=cell_line,
                        source=f"{wxr.wtp.title}/vervoeging",
                        sense=sense,
                    )
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for col_header in col_headers:
                        if (
                            col_index >= col_header.col_index
                            and col_index
                            < col_header.col_index + col_header.colspan
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                col_index += cell_colspan
        last_row_all_header = current_row_all_header