Coverage for src/wiktextract/extractor/nl/pos.py: 90%

172 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .example import ( 

8 EXAMPLE_TEMPLATES, 

9 extract_example_list_item, 

10 extract_example_template, 

11) 

12from .models import AltForm, Sense, WordEntry 

13from .section_titles import LINKAGE_SECTIONS, POS_DATA 

14from .tags import ( 

15 GLOSS_TAG_TEMPLATES, 

16 LIST_ITEM_TAG_TEMPLATES, 

17 translate_raw_tags, 

18) 

19 

20 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Open a new word entry for a part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data`` and given the
    POS value and tags that ``POS_DATA`` holds for ``pos_title``.  Forms and
    categories collected in ``forms_data`` are copied to the new entry when
    the POS matches, otherwise they are cleared from ``forms_data``.  The
    section's child nodes are then processed by ``extract_pos_section_nodes``.

    Raises ``KeyError`` if ``pos_title`` is not a key of ``POS_DATA``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Forms gathered before any POS was known adopt the POS of this entry.
    if forms_data.pos == "unknown":
        forms_data.pos = entry.pos
    if forms_data.pos != entry.pos:
        forms_data.forms.clear()
        forms_data.categories.clear()
    else:
        entry.forms.extend(forms_data.forms)
        entry.categories.extend(forms_data.categories)

    extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node)

    # The node extraction may have appended further entries, so the last
    # entry is looked up again instead of reusing `entry`.
    if pos_title in LINKAGE_SECTIONS and not page_data[-1].senses:
        page_data.pop()

45 

46 

def extract_pos_section_nodes(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Walk the children of a POS section and fill the last word entry.

    Handled child kinds, in test order:

    * lists whose marker ends with "#" or "::" - every list item is passed to
      ``extract_gloss_list_item``; the nodes in front of the first "#" list
      are passed once to ``extract_pos_header_line_nodes``;
    * nested level nodes - parsed as a section when the title is a POS title
      that is not a linkage title, otherwise iteration stops;
    * template nodes - example templates, noun-like form-of templates, verb
      form-of templates, and finally any other template whose expansion is
      wrapped in parentheses, which becomes a raw tag of the last sense.
    """
    # Constant lookup data, built once per call instead of once per node.
    noun_form_names = (
        "noun-pl",
        "nl-advb-form",
        "noun-dim",
        "noun-dim-pl",
        "num-form",
        "ordn-form",
        "prep-form",
        "pronom-dem-form",
        "pronom-pos-form",
        "xh-pronom-pos-form",
        "oudeschrijfwijze",
    )
    noun_form_suffixes = ("adjc-form", "adverb-form", "noun-form")
    verb_form_prefixes = (
        "1ps",
        "2ps",
        "aanv-w",
        "onv-d",
        "ott-",
        "ovt-",
        "tps",
        "volt-d",
        "eng-onv-d",
    )

    # `None` means "no gloss list seen yet".  Using 0 for that would clash
    # with a gloss list that really is the first child (index 0) and would
    # run the header-line extraction again for a later "#" list.
    gloss_list_start = None
    for index, node in enumerate(level_node.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.endswith(("#", "::"))
        ):
            if gloss_list_start is None and node.sarg.endswith("#"):
                gloss_list_start = index
                extract_pos_header_line_nodes(
                    wxr, page_data[-1], level_node.children[:index]
                )
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
        elif isinstance(node, LevelNode):
            title_text = clean_node(wxr, None, node.largs)
            if title_text in POS_DATA and title_text not in LINKAGE_SECTIONS:
                # expanded from "eng-onv-d" form-of template
                from .page import parse_section

                parse_section(wxr, page_data, base_data, forms_data, node)
            else:
                break
        elif isinstance(node, TemplateNode):
            name = node.template_name
            if name in EXAMPLE_TEMPLATES and len(page_data[-1].senses) > 0:
                extract_example_template(wxr, page_data[-1].senses[-1], node)
            elif (
                name in noun_form_names
                or name.endswith(noun_form_suffixes)
                or re.search(r"-dec\d+", name) is not None
            ):
                extract_noun_form_of_template(wxr, page_data[-1], node)
            elif name.startswith(verb_form_prefixes) or name.endswith(
                "verb-form"
            ):
                extract_verb_form_of_template(
                    wxr, page_data, base_data, forms_data, node
                )
            else:
                # tag template after form-of template
                cats = {}
                expanded_text = clean_node(wxr, cats, node)
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and len(page_data[-1].senses) > 0
                ):
                    last_sense = page_data[-1].senses[-1]
                    last_sense.raw_tags.append(expanded_text.strip("() "))
                    last_sense.categories.extend(cats.get("categories", []))
                    translate_raw_tags(last_sense)

139 

140 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    """Turn one gloss list item into a sense of ``word_entry``.

    A list item whose marker is exactly "::" extends the last existing sense
    (when there is one); every other item creates a new ``Sense``.  Tag
    templates, parenthesised template or italic text and a leading "(...)"
    group end up as tags or raw tags, nested "*" lists are handed to
    ``extract_example_list_item``, and the remaining nodes form the gloss
    text.  A new sense is only appended when it received a gloss, a tag, a
    raw tag or an example.
    """

    def wrapped_in_parens(text: str) -> bool:
        return text.startswith("(") and text.endswith(")")

    continues_previous = list_item.sarg == "::" and len(word_entry.senses) > 0
    create_new_sense = not continues_previous
    sense = word_entry.senses[-1] if continues_previous else Sense()

    gloss_parts = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in GLOSS_TAG_TEMPLATES:
                sense.raw_tags.append(clean_node(wxr, sense, child))
            elif name in LIST_ITEM_TAG_TEMPLATES:
                sense.tags.append(LIST_ITEM_TAG_TEMPLATES[name])
            else:
                template_text = clean_node(wxr, sense, child)
                if wrapped_in_parens(template_text):
                    sense.raw_tags.append(template_text.strip("() "))
                else:
                    gloss_parts.append(template_text)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Only "*" sub-lists are read (as examples); other nested lists
            # are dropped from the gloss.
            if child.sarg.endswith("*"):
                for example_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, example_item)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, sense, child)
            if wrapped_in_parens(italic_text):
                sense.raw_tags.append(italic_text.strip("() "))
            else:
                gloss_parts.append(italic_text)
        else:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    # Commas left over between qualifier templates.
    while gloss_text.startswith(","):
        gloss_text = gloss_text[1:].strip()

    leading_group = re.match(r"\(([^()]+)\)", gloss_text)
    if leading_group is not None:
        remainder = gloss_text[leading_group.end() :].strip()
        if remainder == "":
            # gloss text after form-of template
            gloss_text = leading_group.group(1)
        else:
            # expanded "verouderd" template in "2ps" template
            sense.raw_tags.append(leading_group.group(1))
            gloss_text = remainder

    if gloss_text != "":
        sense.glosses.append(gloss_text)

    if not (sense.glosses or sense.tags or sense.raw_tags or sense.examples):
        return
    translate_raw_tags(sense)
    if not sense.glosses:
        sense.tags.append("no-gloss")
    if create_new_sense:
        word_entry.senses.append(sense)

204 

205 

def extract_pos_header_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Read the nodes that precede the gloss list of a POS section.

    While ``word_entry.etymology_index`` is still empty, the text between
    square brackets in a string node is stored there.  "-l-" templates are
    passed to ``extract_l_template``; the cleaned text of a "dimt" template
    is added as a raw tag and translated.  Other nodes are ignored.
    """
    for node in nodes:
        if isinstance(node, str) and word_entry.etymology_index == "":
            bracketed = re.search(r"\[(.+)\]", node.strip())
            if bracketed is not None:
                word_entry.etymology_index = bracketed.group(1).strip()
            continue
        if not isinstance(node, TemplateNode):
            continue

        name = node.template_name
        if name == "-l-":
            extract_l_template(wxr, word_entry, node)
        elif name == "dimt":
            raw_tag = clean_node(wxr, word_entry, node)
            word_entry.raw_tags.append(raw_tag)
            translate_raw_tags(word_entry)

220 

221 

def extract_l_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Add the tags selected by the first argument of a "-l-" template.

    The cleaned first positional argument is looked up in a fixed table and
    the matching tags are appended to ``word_entry.tags``.  An argument that
    is not in the table adds nothing.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-l-
    first_arg = clean_node(wxr, None, node.template_parameters.get(1, ""))
    # Every value is a list so that one `extend` covers single and combined
    # genders.  "f" and "mf" were missing here although the form-of gender
    # table of this module has them, so feminine headwords got no tag.
    # NOTE(review): "f"/"mf" are assumed to be accepted by Sjabloon:-l- in
    # the same way as by the form-of templates - confirm against the
    # template page.
    gender_args = {
        "n": ["neuter"],
        "m": ["masculine"],
        "f": ["feminine"],
        "fm": ["feminine", "masculine"],
        "mf": ["feminine", "masculine"],
        "p": ["plural"],
    }
    word_entry.tags.extend(gender_args.get(first_arg, []))

238 

239 

240# https://nl.wiktionary.org/wiki/Sjabloon:noun-pl 

241# https://nl.wiktionary.org/wiki/Sjabloon:noun-form 

242# https://nl.wiktionary.org/wiki/Sjabloon:oudeschrijfwijze 

243# "getal" and "gesl" args 

244NOUN_FORM_OF_TEMPLATE_NUM_TAGS = { 

245 "s": "singular", 

246 "p": "plural", 

247 "d": "dual", 

248 "c": "collective", 

249 "a": "animate", 

250 "i": "inanimate", 

251} 

252NOUN_FORM_OF_TEMPLATE_GENDER_TAGS = { 

253 "m": "masculine", 

254 "f": "feminine", 

255 "n": "neuter", 

256 "c": "common", 

257 "fm": ["feminine", "masculine"], 

258 "mf": ["feminine", "masculine"], 

259 "mn": ["masculine", "neuter"], 

260} 

261 

262 

def extract_oudeschrijfwijze_template_g_arg(
    wxr: WiktextractContext, g_arg: str, sense: Sense
) -> bool:
    """Translate one "g" argument value into tags of ``sense``.

    The gender table is consulted before the number table, so a value found
    in both is treated as a gender.  Returns ``True`` when the value was
    found in either table, ``False`` otherwise (``sense`` is then unchanged).
    """
    if g_arg in NOUN_FORM_OF_TEMPLATE_GENDER_TAGS:
        found = NOUN_FORM_OF_TEMPLATE_GENDER_TAGS[g_arg]
    elif g_arg in NOUN_FORM_OF_TEMPLATE_NUM_TAGS:
        found = NOUN_FORM_OF_TEMPLATE_NUM_TAGS[g_arg]
    else:
        return False

    # Table values are either one tag or a list of tags.
    if isinstance(found, list):
        sense.tags.extend(found)
    elif isinstance(found, str):
        sense.tags.append(found)
    return True

278 

279 

def extract_oudeschrijfwijze_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Add the tags given by the "g" argument of "oudeschrijfwijze".

    The cleaned text of the argument is tried first.  When that text is not
    a known value and the raw argument is a list of nodes, the name of every
    template node in that list is tried instead.
    """
    raw_g = t_node.template_parameters.get("g", "")
    g_text = clean_node(wxr, None, raw_g)
    if extract_oudeschrijfwijze_template_g_arg(wxr, g_text, sense):
        return
    if not isinstance(raw_g, list):
        return
    for part in raw_g:
        if isinstance(part, TemplateNode):
            extract_oudeschrijfwijze_template_g_arg(
                wxr, part.template_name, sense
            )

292 

293 

def extract_noun_form_of_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Append a "form-of" sense built from a form-of template.

    Number comes from the template name (names ending in "-pl") or from the
    "getal" argument, gender from the "gesl" argument, and the target word
    from the first positional argument.  The gloss is the text of the first
    list item found in the expanded template.
    """
    # https://nl.wiktionary.org/wiki/Categorie:Vormsjablonen
    params = t_node.template_parameters
    sense = Sense(tags=["form-of"])

    if t_node.template_name.endswith("-pl"):
        sense.tags.append("plural")
    else:
        num_arg = clean_node(wxr, None, params.get("getal", ""))
        if num_arg in NOUN_FORM_OF_TEMPLATE_NUM_TAGS:
            sense.tags.append(NOUN_FORM_OF_TEMPLATE_NUM_TAGS[num_arg])

    gender_arg = clean_node(wxr, None, params.get("gesl", ""))
    gender_tag = NOUN_FORM_OF_TEMPLATE_GENDER_TAGS.get(gender_arg)
    if isinstance(gender_tag, str):
        sense.tags.append(gender_tag)
    elif isinstance(gender_tag, list):
        sense.tags.extend(gender_tag)

    # Sjabloon:oudeschrijfwijze
    if t_node.template_name == "oudeschrijfwijze":
        extract_oudeschrijfwijze_template(wxr, t_node, sense)

    form_of = clean_node(wxr, None, params.get(1, ""))
    if form_of != "":
        sense.form_of.append(AltForm(word=form_of))

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    # Only the first list item of the expansion is used as gloss.
    first_item = next(
        iter(expanded_node.find_child_recursively(NodeKind.LIST_ITEM)), None
    )
    if first_item is not None:
        sense.glosses.append(clean_node(wxr, None, first_item.children))
    # Result discarded: this call only passes `sense` as the collecting
    # argument for the whole expansion.
    clean_node(wxr, sense, expanded_node)
    word_entry.senses.append(sense)

334 

335 

def extract_verb_form_of_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Expand a verb form-of template and mark the resulting data as form-of.

    The template is expanded and its nodes are processed like a POS section.
    Afterwards the entry that was last in ``page_data`` before the call and
    every entry appended during the call get the "form-of" tag on the entry
    and on each sense, the first template argument as ``form_of`` target,
    and the section categories of the expansion.
    """
    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen_voor_het_Nederlands
    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen
    from .page import extract_section_categories

    orig_data_len = len(page_data)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    extract_pos_section_nodes(
        wxr, page_data, base_data, forms_data, expanded_node
    )
    form_of = clean_node(wxr, None, t_node.template_parameters.get(1, ""))

    # Negative start index: the previously last entry plus all new entries.
    first_affected = orig_data_len - len(page_data) - 1
    for word_entry in page_data[first_affected:]:
        for sense in word_entry.senses:
            # A section can hold several verb-form templates and each call
            # visits all senses of the current entry again, so tag and
            # target are only added when they are not there yet.
            if "form-of" not in sense.tags:
                sense.tags.append("form-of")
            if form_of != "" and all(
                alt_form.word != form_of for alt_form in sense.form_of
            ):
                sense.form_of.append(AltForm(word=form_of))
        extract_section_categories(wxr, word_entry, expanded_node)
        if "form-of" not in word_entry.tags:
            word_entry.tags.append("form-of")