Coverage for src/wiktextract/extractor/nl/pos.py: 92%

193 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .example import ( 

8 EXAMPLE_TEMPLATES, 

9 extract_example_list_item, 

10 extract_example_template, 

11) 

12from .models import AltForm, Form, Sense, WordEntry 

13from .section_titles import LINKAGE_SECTIONS, POS_DATA 

14from .tags import ( 

15 GLOSS_TAG_TEMPLATES, 

16 LIST_ITEM_TAG_TEMPLATES, 

17 translate_raw_tags, 

18) 

19 

20 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Start a new word entry for a part-of-speech section and fill it.

    A deep copy of `base_data` is appended to `page_data` and receives the
    POS title, POS value and POS tags looked up in `POS_DATA`. Forms and
    categories held in `forms_data` are copied to the new entry when the
    POS values agree; otherwise they are cleared from `forms_data`. The
    section's child nodes are then processed, and the last entry is removed
    again when it has no senses and `pos_title` is also a linkage title.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # `forms_data` adopts the POS of the first section that uses it.
    if forms_data.pos == "unknown":
        forms_data.pos = entry.pos
    if forms_data.pos != entry.pos:
        forms_data.forms.clear()
        forms_data.categories.clear()
    else:
        entry.forms.extend(forms_data.forms)
        entry.categories.extend(forms_data.categories)

    extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node)

    # Processing the nodes may have appended further entries, so look at the
    # current last entry instead of `entry`.
    if pos_title in LINKAGE_SECTIONS and not page_data[-1].senses:
        page_data.pop()

45 

46 

# Template names handled by `extract_noun_form_of_template()`.
_NOUN_FORM_OF_TEMPLATES = frozenset(
    {
        "noun-pl",
        "nl-advb-form",
        "noun-dim",
        "noun-dim-pl",
        "num-form",
        "ordn-form",
        "prep-form",
        "pronom-dem-form",
        "pronom-pos-form",
        "xh-pronom-pos-form",
        "oudeschrijfwijze",
    }
)
_NOUN_FORM_OF_TEMPLATE_SUFFIXES = ("adjc-form", "adverb-form", "noun-form")
_DECLENSION_TEMPLATE_RE = re.compile(r"-dec\d+")

# Template names handled by `extract_verb_form_of_template()`.
_VERB_FORM_OF_TEMPLATE_PREFIXES = (
    "1ps",
    "2ps",
    "aanv-w",
    "onv-d",
    "ott-",
    "ovt-",
    "tps",
    "volt-d",
    "eng-onv-d",
    # Categorie:Bijvoeglijknaamwoordsjablonen
    "dan-adjc-",
    "la-adjc-",
    "nno-adjc-",
    "nor-adjc-",
    "swe-adjc-",
)
_VERB_FORM_OF_TEMPLATE_SUFFIXES = (
    # Categorie:Werkwoordsvormsjablonen
    "verb-form",
    "-gw",
    "-lv",
    "-lv-vt",
    "-lv-vtd",
    "-onv-d",
    "-twt",
    "-vt",
    "-vt-onr",
    "-3ps",
    "-inf",
    "-lv-hv",
    "-twt-bv",
    "-twt-hv",
    "-vt-onr-bv",
    "-vt-onr-hv",
)
_VERB_FORM_OF_TEMPLATES = frozenset(
    {"fra-deelwoord", "2ps-rus", "ww-kur", "ww-tur"}
)


def _is_noun_form_of_template(name: str) -> bool:
    """Return True if `name` is routed to the noun form-of extractor."""
    return (
        name in _NOUN_FORM_OF_TEMPLATES
        or name.endswith(_NOUN_FORM_OF_TEMPLATE_SUFFIXES)
        or _DECLENSION_TEMPLATE_RE.search(name) is not None
    )


def _is_verb_form_of_template(name: str) -> bool:
    """Return True if `name` is routed to the verb form-of extractor."""
    return (
        name.startswith(_VERB_FORM_OF_TEMPLATE_PREFIXES)
        or name.endswith(_VERB_FORM_OF_TEMPLATE_SUFFIXES)
        or name in _VERB_FORM_OF_TEMPLATES
    )


def _find_parent_sense(word_entry: WordEntry, list_marker: str) -> Sense | None:
    """Return the sense that a nested ("##...") gloss item belongs to.

    The parent is the first existing sense whose glosses equal the leading
    `len(list_marker) - 1` glosses of the most recently added sense. None is
    returned for non-nested lists, when there is no sense yet, or when no
    sense matches.
    """
    if not list_marker.endswith("##") or len(word_entry.senses) == 0:
        return None
    parent_glosses = word_entry.senses[-1].glosses[: len(list_marker) - 1]
    for sense in word_entry.senses:
        if sense.glosses == parent_glosses:
            return sense
    return None


def extract_pos_section_nodes(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Process the direct children of a POS section node in order.

    Handled child kinds:

    - lists whose marker ends with "#" or "::": gloss list items; the nodes
      in front of the first "#" list are treated as the POS header line
    - level nodes: parsed as a nested POS section when the title is in
      `POS_DATA` and not in `LINKAGE_SECTIONS`; any other level node stops
      the iteration
    - templates: example, noun form-of and verb form-of templates are sent
      to their extractors; any other template that expands to "(...)" is
      added as a raw tag of the latest sense
    - the first bold node: passed to `extract_form_line_bold_node()`

    Results are written to `page_data[-1]`, which can change while iterating
    because nested sections and verb form-of templates may append entries.
    """
    # Explicit flags rather than "index == 0" / None as "not seen yet": a
    # gloss list that is the very first child must not let later lists
    # trigger header extraction again.
    header_extracted = False
    seen_bold = False
    for index, node in enumerate(level_node.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.endswith(("#", "::"))
        ):
            if not header_extracted and node.sarg.endswith("#"):
                header_extracted = True
                extract_pos_header_line_nodes(
                    wxr, page_data[-1], level_node.children[:index]
                )
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                # Looked up per item because the previous item may have
                # appended a new sense.
                parent_sense = _find_parent_sense(page_data[-1], node.sarg)
                extract_gloss_list_item(
                    wxr, page_data[-1], list_item, parent_sense
                )
        elif isinstance(node, LevelNode):
            title_text = clean_node(wxr, None, node.largs)
            if title_text in POS_DATA and title_text not in LINKAGE_SECTIONS:
                # expanded from "eng-onv-d" form-of template
                from .page import parse_section

                parse_section(wxr, page_data, base_data, forms_data, node)
            else:
                break
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in EXAMPLE_TEMPLATES
            and len(page_data[-1].senses) > 0
        ):
            extract_example_template(wxr, page_data[-1].senses[-1], node)
        elif isinstance(node, TemplateNode) and _is_noun_form_of_template(
            node.template_name
        ):
            extract_noun_form_of_template(wxr, page_data[-1], node)
        elif isinstance(node, TemplateNode) and _is_verb_form_of_template(
            node.template_name
        ):
            extract_verb_form_of_template(
                wxr, page_data, base_data, forms_data, node
            )
        elif isinstance(node, TemplateNode):
            # tag template after form-of template
            cats = {}
            expanded_text = clean_node(wxr, cats, node)
            if (
                expanded_text.startswith("(")
                and expanded_text.endswith(")")
                and len(page_data[-1].senses) > 0
            ):
                last_sense = page_data[-1].senses[-1]
                last_sense.raw_tags.append(expanded_text.strip("() "))
                last_sense.categories.extend(cats.get("categories", []))
                translate_raw_tags(last_sense)
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and not seen_bold
        ):
            extract_form_line_bold_node(wxr, page_data[-1], node)
            seen_bold = True

188 

189 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Extract one gloss list item into a sense of `word_entry`.

    A "::" item is merged into the latest existing sense. Any other item
    creates a new sense, copied from `parent_sense` when one is given.
    Templates and italic nodes that expand to "(...)" become raw tags,
    "*" child lists are handled as examples, and the remaining nodes form
    the gloss text. Nested "#" child lists are processed recursively with
    this item's sense as their parent.
    """

    def is_parenthesized(text: str) -> bool:
        return text.startswith("(") and text.endswith(")")

    if list_item.sarg == "::" and len(word_entry.senses) > 0:
        create_new_sense = False
        sense = word_entry.senses[-1]
    else:
        create_new_sense = True
        if parent_sense is None:
            sense = Sense()
        else:
            sense = parent_sense.model_copy(deep=True)

    gloss_nodes = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in GLOSS_TAG_TEMPLATES:
                sense.raw_tags.append(clean_node(wxr, sense, child))
            elif name in LIST_ITEM_TAG_TEMPLATES:
                sense.tags.append(LIST_ITEM_TAG_TEMPLATES[name])
            else:
                expanded_text = clean_node(wxr, sense, child)
                if is_parenthesized(expanded_text):
                    sense.raw_tags.append(expanded_text.strip("() "))
                else:
                    gloss_nodes.append(expanded_text)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Only "*" child lists are read here; they hold examples.
            if child.sarg.endswith("*"):
                for example_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, example_item)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, sense, child)
            if is_parenthesized(italic_text):
                sense.raw_tags.append(italic_text.strip("() "))
            else:
                gloss_nodes.append(italic_text)
            continue
        gloss_nodes.append(child)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    # Commas are left over between qualifier templates.
    while gloss_text.startswith(","):
        gloss_text = gloss_text[1:].strip()
    paren_match = re.match(r"\(([^()]+)\)", gloss_text)
    if paren_match is not None:
        remainder = gloss_text[paren_match.end() :].strip()
        if remainder == "":
            # gloss text after form-of template
            gloss_text = paren_match.group(1)
        else:
            # expanded "verouderd" template in "2ps" template
            gloss_text = remainder
            sense.raw_tags.append(paren_match.group(1))

    if gloss_text != "":
        sense.glosses.append(gloss_text)
    has_content = bool(
        sense.glosses or sense.tags or sense.raw_tags or sense.examples
    )
    if has_content:
        translate_raw_tags(sense)
        if create_new_sense:
            word_entry.senses.append(sense)

    for nested_list in list_item.find_child(NodeKind.LIST):
        marker = nested_list.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, word_entry, nested_item, sense)

263 

264 

def extract_pos_header_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Read the nodes that precede the first gloss list of a POS section.

    The first "[...]" found in a text node is stored as the etymology index
    while that field is still empty. "-l-" templates go to
    `extract_l_template()`, and the expanded text of a "dimt" template is
    added as a raw tag and translated.
    """
    for node in nodes:
        if isinstance(node, str):
            if word_entry.etymology_index == "":
                bracket_match = re.search(r"\[(.+)\]", node.strip())
                if bracket_match is not None:
                    word_entry.etymology_index = bracket_match.group(
                        1
                    ).strip()
            continue
        if not isinstance(node, TemplateNode):
            continue
        template_name = node.template_name
        if template_name == "-l-":
            extract_l_template(wxr, word_entry, node)
        elif template_name == "dimt":
            word_entry.raw_tags.append(clean_node(wxr, word_entry, node))
            translate_raw_tags(word_entry)

279 

280 

def extract_l_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Add tags to `word_entry` from the first argument of a "-l-" template.

    Only the argument values listed in the table below add tags; any other
    value leaves the entry unchanged.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-l-
    tags_by_arg = {
        "n": ["neuter"],
        "m": ["masculine"],
        "fm": ["feminine", "masculine"],
        "p": ["plural"],
    }
    first_arg = clean_node(wxr, None, node.template_parameters.get(1, ""))
    word_entry.tags.extend(tags_by_arg.get(first_arg, []))

297 

298 

299# https://nl.wiktionary.org/wiki/Sjabloon:noun-pl 

300# https://nl.wiktionary.org/wiki/Sjabloon:noun-form 

301# https://nl.wiktionary.org/wiki/Sjabloon:oudeschrijfwijze 

302# "getal" and "gesl" args 

303NOUN_FORM_OF_TEMPLATE_NUM_TAGS = { 

304 "s": "singular", 

305 "p": "plural", 

306 "d": "dual", 

307 "c": "collective", 

308 "a": "animate", 

309 "i": "inanimate", 

310} 

311NOUN_FORM_OF_TEMPLATE_GENDER_TAGS = { 

312 "m": "masculine", 

313 "f": "feminine", 

314 "n": "neuter", 

315 "c": "common", 

316 "fm": ["feminine", "masculine"], 

317 "mf": ["feminine", "masculine"], 

318 "mn": ["masculine", "neuter"], 

319} 

320 

321 

def extract_oudeschrijfwijze_template_g_arg(
    wxr: WiktextractContext, g_arg: str, sense: Sense
) -> bool:
    """Add the tag(s) for one "g" argument value to `sense`.

    The gender table is consulted before the number table, so a value that
    appears in both (such as "c") yields the gender tag. Returns True when
    the value was found in either table, otherwise False.
    """
    lookup_order = (
        NOUN_FORM_OF_TEMPLATE_GENDER_TAGS,
        NOUN_FORM_OF_TEMPLATE_NUM_TAGS,
    )
    for tag_table in lookup_order:
        if g_arg not in tag_table:
            continue
        found = tag_table[g_arg]
        if isinstance(found, str):
            sense.tags.append(found)
        elif isinstance(found, list):
            sense.tags.extend(found)
        return True
    return False

337 

338 

def extract_oudeschrijfwijze_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Add tags to `sense` from the "g" argument of "oudeschrijfwijze".

    The cleaned text of the argument is tried first. When it matches no
    tag, and the raw argument is a list of nodes, the name of each template
    node in that list is tried instead.
    """
    raw_g_arg = t_node.template_parameters.get("g", "")
    cleaned_g_arg = clean_node(wxr, None, raw_g_arg)
    if extract_oudeschrijfwijze_template_g_arg(wxr, cleaned_g_arg, sense):
        return
    if not isinstance(raw_g_arg, list):
        return
    for arg_node in raw_g_arg:
        if isinstance(arg_node, TemplateNode):
            extract_oudeschrijfwijze_template_g_arg(
                wxr, arg_node.template_name, sense
            )

351 

352 

def extract_noun_form_of_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Append a "form-of" sense built from a form-of template.

    Number and gender tags come from the template name ("-pl" suffix) or
    from the "getal" and "gesl" arguments. The first positional argument is
    stored as the form-of word, and the first list item of the expanded
    template becomes the gloss.
    """
    # https://nl.wiktionary.org/wiki/Categorie:Vormsjablonen
    template_args = t_node.template_parameters
    sense = Sense(tags=["form-of"])

    if t_node.template_name.endswith("-pl"):
        sense.tags.append("plural")
    else:
        num_arg = clean_node(wxr, None, template_args.get("getal", ""))
        num_tag = NOUN_FORM_OF_TEMPLATE_NUM_TAGS.get(num_arg)
        if num_tag is not None:
            sense.tags.append(num_tag)

    gender_arg = clean_node(wxr, None, template_args.get("gesl", ""))
    gender_tag = NOUN_FORM_OF_TEMPLATE_GENDER_TAGS.get(gender_arg)
    if isinstance(gender_tag, str):
        sense.tags.append(gender_tag)
    elif isinstance(gender_tag, list):
        sense.tags.extend(gender_tag)

    # Sjabloon:oudeschrijfwijze
    if t_node.template_name == "oudeschrijfwijze":
        extract_oudeschrijfwijze_template(wxr, t_node, sense)

    form_of = clean_node(wxr, None, template_args.get(1, ""))
    if form_of != "":
        sense.form_of.append(AltForm(word=form_of))

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Only the first list item of the expansion is used as gloss.
    first_list_item = next(
        iter(expanded_node.find_child_recursively(NodeKind.LIST_ITEM)), None
    )
    if first_list_item is not None:
        sense.glosses.append(clean_node(wxr, None, first_list_item.children))
    # The text result is discarded; the call passes `sense` as the
    # collection target for the whole expansion.
    clean_node(wxr, sense, expanded_node)
    word_entry.senses.append(sense)

393 

394 

def extract_verb_form_of_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Expand a verb form-of template and extract senses from the result.

    The expanded template is processed like a POS section. Afterwards the
    entry that was last before the call, plus every entry appended by the
    expansion, is tagged "form-of", gets its section categories extracted,
    and each of its senses receives the "form-of" tag and form-of word.
    """
    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen_voor_het_Nederlands
    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen
    from .page import extract_section_categories

    entries_before = len(page_data)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    extract_pos_section_nodes(
        wxr, page_data, base_data, forms_data, expanded_node
    )

    # "la-adjc-form" is read from the third positional argument, every
    # other template from the first.
    if t_node.template_name == "la-adjc-form":
        form_of_arg = 3
    else:
        form_of_arg = 1
    form_of = clean_node(
        wxr, None, t_node.template_parameters.get(form_of_arg, "")
    )

    # Start at the entry that was last before the expansion.
    first_affected = max(entries_before - 1, 0)
    for word_entry in page_data[first_affected:]:
        for sense in word_entry.senses:
            sense.tags.append("form-of")
            if form_of != "":
                sense.form_of.append(AltForm(word=form_of))
        extract_section_categories(wxr, word_entry, expanded_node)
        word_entry.tags.append("form-of")

427 

428 

def extract_form_line_bold_node(
    wxr: WiktextractContext, word_entry: WordEntry, bold_node: WikiNode
) -> None:
    """Add the bold headword text as a "canonical" form of `word_entry`.

    Nothing is added when the cleaned text is empty or equal to the page
    title.
    """
    word = clean_node(wxr, None, bold_node)
    if word != "" and word != wxr.wtp.title:
        word_entry.forms.append(Form(form=word, tags=["canonical"]))

433 if word != "" and word != wxr.wtp.title: 

434 word_entry.forms.append(Form(form=word, tags=["canonical"]))