Coverage for src/wiktextract/extractor/es/page.py: 61%

144 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

from typing import Any

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    NodeKind,
    TemplateNode,
    WikiNode,
    WikiNodeChildrenList,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .conjugation import extract_conjugation_section
from .etymology import process_etymology_block
from .example import extract_example
from .gloss import extract_gloss, process_ambito_template, process_uso_template
from .inflection import extract_inflection
from .linkage import extract_linkage_section, process_linkage_template
from .models import Sense, WordEntry
from .pronunciation import process_pron_graf_template
from .section_titles import (
    IGNORED_TITLES,
    LINKAGE_TITLES,
    POS_TITLES,
    TRANSLATIONS_TITLES,
)
from .sense_data import process_sense_data_list
from .translation import extract_translation_section

28 

29 

def parse_entries(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
):
    """
    Parse the entries of a language section (level 2) or of an etymology
    section (level 3).

    Data that applies to every subsection, e.g. the {pron-graf} template, is
    gathered first into a private copy of base_data. Each child section is
    then handed to parse_section() together with that copy.

    A language section may contain multiple entries, usually divided by
    different POS with level 3 headings,
    e.g. https://es.wiktionary.org/wiki/agua or
    https://es.wiktionary.org/wiki/love

    If a word has distinct etymologies, these are separated by level 3
    headings and subdivided by their POS at level 4 headings,
    e.g. https://es.wiktionary.org/wiki/churro
    """

    # Work on a deep copy so that nothing collected here is written back to
    # the caller's base_data.
    shared_data = base_data.model_copy(deep=True)
    unexpected_nodes = []

    # First pass: everything that is not itself a sub-section heading.
    for item in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if (
            isinstance(item, TemplateNode)
            and item.template_name == "pron-graf"
            and wxr.config.capture_pronunciation
        ):
            process_pron_graf_template(wxr, shared_data, item)
            continue

        if not isinstance(item, WikiNode):
            unexpected_nodes.append(item)
            continue

        if item.kind == NodeKind.LIST and item.sarg == ":*":
            # XXX: There might be other uses for this kind of list which are
            # being ignored here
            continue

        if item.kind == NodeKind.LINK:
            clean_node(wxr, shared_data, item)
        else:
            unexpected_nodes.append(item)

    if unexpected_nodes:
        wxr.wtp.debug(
            f"Found unexpected nodes {unexpected_nodes} "
            f"in section {level_node.largs}",
            sortid="extractor/es/page/parse_entries/69",
        )

    # Second pass: the sub-section headings themselves.
    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, shared_data, child_section)

84 

85 

86def parse_section( 

87 wxr: WiktextractContext, 

88 page_data: list[WordEntry], 

89 base_data: WordEntry, 

90 level_node: WikiNode, 

91) -> None: 

92 """ 

93 Parses individual sibling sections of an entry, 

94 e.g. https://es.wiktionary.org/wiki/amor: 

95 

96 === Etimología === 

97 === {{sustantivo masculino|es}} === 

98 === Locuciones === 

 The lower-cased heading text, or the name of a template used in the
 heading, selects the handler: POS sections, etymology, translations,
 linkages and "conjugación" are dispatched to their extractors; any other
 heading is reported through wxr.wtp.debug(). Nested sections are then
 parsed by calling this function again.

99 """ 

100 

# The dict handed to clean_node() is read back below under the "categories"
# key; presumably clean_node() stores heading categories there - confirm.
101 categories = {} 

102 section_title = clean_node(wxr, categories, level_node.largs).lower() 

103 wxr.wtp.start_subsection(section_title) 

104 

# The loop overwrites the name on every iteration, so when the heading
# contains several templates only the last one is kept.
105 pos_template_name = "" 

106 for level_node_template in level_node.find_content(NodeKind.TEMPLATE): 

107 pos_template_name = level_node_template.template_name 

108 

109 if section_title in IGNORED_TITLES: 109 ↛ 110line 109 didn't jump to line 110 because the condition on line 109 was never true

110 pass 

111 elif pos_template_name in POS_TITLES or section_title in POS_TITLES: 

# The template name takes precedence; the heading text is the fallback key.
112 pos_data = POS_TITLES.get( 

113 pos_template_name, POS_TITLES.get(section_title) 

114 ) 

115 pos_type = pos_data["pos"] 

# NOTE(review): this listing has lost its indentation, so it cannot be told
# from here which of the statements below (117-122) are guarded by the
# "forma flexiva" check - confirm against the original file before editing.
116 if section_title != "forma flexiva": 

117 page_data.append(base_data.model_copy(deep=True)) 

118 page_data[-1].pos = pos_type 

119 page_data[-1].pos_title = section_title 

120 page_data[-1].tags.extend(pos_data.get("tags", [])) 

121 page_data[-1].categories.extend(categories.get("categories", [])) 

122 process_pos_block(wxr, page_data, level_node) 

123 elif ( 123 ↛ 128line 123 didn't jump to line 128 because the condition on line 123 was always true

124 section_title.startswith("etimología") 

125 and wxr.config.capture_etymologies 

126 ): 

# base_data itself (not a copy) is passed to the etymology extractor.
127 process_etymology_block(wxr, base_data, level_node) 

128 elif ( 

129 section_title in TRANSLATIONS_TITLES and wxr.config.capture_translations 

130 ): 

# The branches below write to page_data[-1]; if no entry exists yet, one is
# created from a copy of base_data first.
131 if len(page_data) == 0: 

132 page_data.append(base_data.model_copy(deep=True)) 

133 extract_translation_section(wxr, page_data[-1], level_node) 

134 elif section_title in LINKAGE_TITLES: 

135 if len(page_data) == 0: 

136 page_data.append(base_data.model_copy(deep=True)) 

137 extract_linkage_section( 

138 wxr, page_data[-1], level_node, LINKAGE_TITLES[section_title] 

139 ) 

140 elif section_title == "conjugación": 

141 if len(page_data) == 0: 

142 page_data.append(base_data.model_copy(deep=True)) 

143 extract_conjugation_section(wxr, page_data[-1], level_node) 

144 else: 

145 wxr.wtp.debug( 

146 f"Unprocessed section: {section_title}", 

147 sortid="extractor/es/page/parse_section/48", 

148 ) 

149 

# Recurse into nested sub-sections, passing the same page_data and base_data.
# NOTE(review): appears to run for every section regardless of the branch
# taken above - confirm against the original indentation.
150 for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS): 

151 parse_section(wxr, page_data, base_data, next_level_node) 

152 

153 

154def process_pos_block( 

155 wxr: WiktextractContext, 

156 page_data: list[WordEntry], 

157 pos_level_node: WikiNode, 

158): 

159 """ 

160 Senses are indicated by ListNodes with a semicolon as argument. They can be 

161 followed by multiple nodes that add different kinds of information to the 

162 sense. These nodes are collected in sense_children and processed after the 

163 next sense is encountered or after the last sense has been processed. 

 All results are written to page_data[-1], so page_data must already
 contain the entry this POS section belongs to. When the section contains
 no list node, its cleaned text is used as a single gloss instead.

164 """ 

165 

166 child_nodes = list(pos_level_node.filter_empty_str_child()) 

167 # All non-gloss nodes that add additional information to a sense 

168 sense_children: WikiNodeChildrenList = [] 

169 

170 for child in child_nodes: 

171 if ( 

172 isinstance(child, WikiNode) 

173 and child.kind == NodeKind.LIST 

174 and child.sarg == ";" 

175 ): 

176 # Consume sense_children of previous sense and extract gloss of 

177 # new sense 

178 process_sense_children(wxr, page_data, sense_children) 

179 sense_children = [] 

180 

181 extract_gloss(wxr, page_data, child) 

182 

# Once the current entry has at least one sense, every further non-gloss node
# is queued for that sense.
183 elif page_data[-1].senses: 183 ↛ 184line 183 didn't jump to line 184 because the condition on line 183 was never true

184 sense_children.append(child) 

185 

186 else: 

187 # Process nodes before first sense 

# Templates whose name contains "inflect" or "v.conj" go to the inflection
# extractor.
188 if isinstance(child, TemplateNode) and ( 188 ↛ 192line 188 didn't jump to line 192 because the condition on line 188 was never true

189 "inflect" in child.template_name 

190 or "v.conj" in child.template_name 

191 ): 

192 extract_inflection(wxr, page_data, child) 

# A link whose first argument starts with an element containing "Categoría"
# is cleaned against the entry itself rather than against a sense.
193 elif ( 193 ↛ 198line 193 didn't jump to line 198 because the condition on line 193 was never true

194 isinstance(child, WikiNode) 

195 and child.kind == NodeKind.LINK 

196 and "Categoría" in child.largs[0][0] 

197 ): 

198 clean_node(wxr, page_data[-1], child) 

199 else: 

200 wxr.wtp.debug( 

201 f"Found unexpected node in pos_block: {child}", 

202 sortid="extractor/es/page/process_pos_block/184", 

203 ) 

204 

# Flush the nodes queued for the last sense; a section without any list node
# falls back to using its whole cleaned text as one gloss.
205 if pos_level_node.contain_node(NodeKind.LIST): 

206 process_sense_children(wxr, page_data, sense_children) 

207 else: 

208 sense = Sense() 

209 gloss_text = clean_node(wxr, sense, pos_level_node.children) 

# NOTE(review): indentation is lost in this listing, so it cannot be told
# from here whether line 212 is guarded by the length check on line 210 -
# confirm against the original file.
210 if len(gloss_text) > 0: 

211 sense.glosses.append(gloss_text) 

212 page_data[-1].senses.append(sense) 

213 

214 

def process_sense_children(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    sense_children: WikiNodeChildrenList,
) -> None:
    """
    Group the nodes that follow a gloss and extract data from each group.

    In most cases additional information to a sense is given via special
    templates or lists. However, sometimes string nodes are used to add
    information to a preceding template or list. Every template, list or
    link node therefore opens a new group, and all nodes up to the next such
    node belong to that group.

    wxr is used for debug messages and forwarded to the extraction helpers.
    Only the last entry of page_data (and, for most group kinds, its last
    sense) receives the extracted data. sense_children are the nodes
    collected after a gloss, in document order.
    """

    def starts_new_group(child: WikiNode) -> bool:
        # Nested function for readability
        return isinstance(child, WikiNode) and (
            child.kind == NodeKind.TEMPLATE
            or child.kind == NodeKind.LIST
            or child.kind == NodeKind.LINK
        )

    def process_group(
        wxr: WiktextractContext,
        page_data: list[WordEntry],
        group: WikiNodeChildrenList,
    ) -> None:
        # Nested function for readability
        if len(group) == 0:
            return

        # Every decision below is made on the node that opened the group.
        # The enclosing loop variable must not be consulted here: while a
        # group is being flushed it already refers to the node that opens
        # the *next* group.
        head = group[0]

        if isinstance(head, TemplateNode):
            template_name = head.template_name
            if template_name == "clear":
                return
            elif template_name.removesuffix("s") in LINKAGE_TITLES:
                # A trailing "s" is dropped before the lookup
                process_linkage_template(wxr, page_data[-1], head)
            elif template_name == "ejemplo":
                # The example extractor receives the whole group, not just
                # the template
                extract_example(wxr, page_data[-1].senses[-1], group)
            elif template_name == "uso":
                process_uso_template(wxr, page_data[-1].senses[-1], head)
            elif template_name == "ámbito":
                process_ambito_template(wxr, page_data[-1].senses[-1], head)
            else:
                wxr.wtp.debug(
                    f"Found unexpected group specifying a sense: {group},"
                    f"head template {template_name}",
                    sortid="extractor/es/page/process_group/102",
                )

        elif isinstance(head, WikiNode) and head.kind == NodeKind.LIST:
            # List groups seem to not be followed by string nodes.
            # We, therefore, only process the list node.
            process_sense_data_list(wxr, page_data[-1], head)

        elif (
            isinstance(head, WikiNode)
            and head.kind == NodeKind.LINK
            and "Categoría" in head.largs[0][0]
        ):
            # Extract sense categories
            clean_node(wxr, page_data[-1].senses[-1], head)

        else:
            wxr.wtp.debug(
                f"Found unexpected group specifying a sense: {group}",
                sortid="extractor/es/page/process_group/117",
            )

    group: WikiNodeChildrenList = []
    for child in sense_children:
        if starts_new_group(child):
            # Flush the finished group before opening a new one
            process_group(wxr, page_data, group)
            group = []
        group.append(child)
    # Flush the trailing group
    process_group(wxr, page_data, group)

291 

292 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """
    Parse the wikitext of one page and return its extracted entries.

    Each level 2 section is treated as one language section. The language
    code and name are taken from the first {{lengua}} template in its
    heading, and the section is skipped when
    wxr.config.capture_language_codes is set and does not contain that code.
    Entries that end up without any sense receive a placeholder sense tagged
    "no-gloss".

    Returns one dict per entry, produced by model_dump(exclude_defaults=True).
    """
    # style guide
    # https://es.wiktionary.org/wiki/Wikcionario:Guía_de_estilo
    # entry layout
    # https://es.wiktionary.org/wiki/Wikcionario:Estructura
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_code = "unknown"
        lang_name = "unknown"
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://es.wiktionary.org/wiki/Plantilla:lengua
            # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
            if subtitle_template.template_name == "lengua":
                # Keep the "unknown" default when the first argument is
                # missing or is not plain text, instead of failing on
                # None.lower().
                code_arg = subtitle_template.template_parameters.get(1)
                if isinstance(code_arg, str):
                    lang_code = code_arg.lower()
                lang_name = clean_node(wxr, categories, subtitle_template)
                break
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
            categories=categories.get("categories", []),
        )
        parse_entries(wxr, page_data, base_data, level2_node)

    # Give every sense-less entry an explicit placeholder sense
    for data in page_data:
        if not data.senses:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]