Coverage for src/wiktextract/extractor/es/page.py: 61% (144 statements)

from typing import Any

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    NodeKind,
    TemplateNode,
    WikiNode,
    WikiNodeChildrenList,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .conjugation import extract_conjugation_section
from .etymology import process_etymology_block
from .example import extract_example
from .gloss import extract_gloss, process_ambito_template, process_uso_template
from .inflection import extract_inflection
from .linkage import extract_linkage_section, process_linkage_template
from .models import Sense, WordEntry
from .pronunciation import process_pron_graf_template
from .section_titles import (
    IGNORED_TITLES,
    LINKAGE_TITLES,
    POS_TITLES,
    TRANSLATIONS_TITLES,
)
from .sense_data import process_sense_data_list
from .translation import extract_translation_section


def parse_entries(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
):
36 """
37 Parse entries in a language section (level 2) or etymology section (level 3)
38 and extract data affecting all subsections, e.g. the {pron-graf} template.
40 A language section may contain multiple entries, usually devided by
41 different POS with level 3 headings,
42 e.g. https://es.wiktionary.org/wiki/agua or
43 https://es.wiktionary.org/wiki/love
45 If a word has distinct etmylogies, these are separated by level 3 headings
46 and subdivided by their POS at level 4 headings,
47 e.g. https://es.wiktionary.org/wiki/churro
48 """

    # Copying may be unnecessary, but it prevents base_data from being applied
    # to entries it should not apply to.
    base_data_copy = base_data.model_copy(deep=True)
    unexpected_nodes = []
    # Parse data affecting all subsections and add it to base_data_copy
    for node in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if (
            isinstance(node, TemplateNode)
            and node.template_name == "pron-graf"
            and wxr.config.capture_pronunciation
        ):
            process_pron_graf_template(wxr, base_data_copy, node)
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg == ":*"
        ):
            # XXX: There might be other uses for this kind of list which are
            # being ignored here
            continue
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            clean_node(wxr, base_data_copy, node)
        else:
            unexpected_nodes.append(node)

    if unexpected_nodes:
        wxr.wtp.debug(
            f"Found unexpected nodes {unexpected_nodes} "
            f"in section {level_node.largs}",
            sortid="extractor/es/page/parse_entries/69",
        )

    for sub_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data_copy, sub_level_node)


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
92 """
93 Parses indidividual sibling sections of an entry,
94 e.g. https://es.wiktionary.org/wiki/amor:
96 === Etimología ===
97 === {{sustantivo masculino|es}} ===
98 === Locuciones ===
99 """

    categories = {}
    section_title = clean_node(wxr, categories, level_node.largs).lower()
    wxr.wtp.start_subsection(section_title)

    pos_template_name = ""
    for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
        pos_template_name = level_node_template.template_name

    if section_title in IGNORED_TITLES:
        pass
    elif pos_template_name in POS_TITLES or section_title in POS_TITLES:
        pos_data = POS_TITLES.get(
            pos_template_name, POS_TITLES.get(section_title)
        )
        pos_type = pos_data["pos"]
        if section_title != "forma flexiva":
            page_data.append(base_data.model_copy(deep=True))
        page_data[-1].pos = pos_type
        page_data[-1].pos_title = section_title
        page_data[-1].tags.extend(pos_data.get("tags", []))
        page_data[-1].categories.extend(categories.get("categories", []))
        process_pos_block(wxr, page_data, level_node)
    elif (
        section_title.startswith("etimología")
        and wxr.config.capture_etymologies
    ):
        process_etymology_block(wxr, base_data, level_node)
    elif (
        section_title in TRANSLATIONS_TITLES and wxr.config.capture_translations
    ):
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation_section(wxr, page_data[-1], level_node)
    elif section_title in LINKAGE_TITLES:
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_linkage_section(
            wxr, page_data[-1], level_node, LINKAGE_TITLES[section_title]
        )
    elif section_title == "conjugación":
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_conjugation_section(wxr, page_data[-1], level_node)
    else:
        wxr.wtp.debug(
            f"Unprocessed section: {section_title}",
            sortid="extractor/es/page/parse_section/48",
        )

    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)


def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    pos_level_node: WikiNode,
):
159 """
160 Senses are indicated by ListNodes with a semicolon as argument. They can be
161 followed by multiple nodes that add different kinds of information to the
162 sense. These nodes are collected in sense_children and processed after the
163 next sense is encountered or after the last sense has been processed.
164 """

    child_nodes = list(pos_level_node.filter_empty_str_child())
    # All non-gloss nodes that add additional information to a sense
    sense_children: WikiNodeChildrenList = []

    for child in child_nodes:
        if (
            isinstance(child, WikiNode)
            and child.kind == NodeKind.LIST
            and child.sarg == ";"
        ):
            # Consume sense_children of previous sense and extract gloss of
            # new sense
            process_sense_children(wxr, page_data, sense_children)
            sense_children = []

            extract_gloss(wxr, page_data, child)

        elif page_data[-1].senses:
            sense_children.append(child)

        else:
            # Process nodes before first sense
            if isinstance(child, TemplateNode) and (
                "inflect" in child.template_name
                or "v.conj" in child.template_name
            ):
                extract_inflection(wxr, page_data, child)
            elif (
                isinstance(child, WikiNode)
                and child.kind == NodeKind.LINK
                and "Categoría" in child.largs[0][0]
            ):
                clean_node(wxr, page_data[-1], child)
            else:
                wxr.wtp.debug(
                    f"Found unexpected node in pos_block: {child}",
                    sortid="extractor/es/page/process_pos_block/184",
                )

    if pos_level_node.contain_node(NodeKind.LIST):
        process_sense_children(wxr, page_data, sense_children)
    else:
        sense = Sense()
        gloss_text = clean_node(wxr, sense, pos_level_node.children)
        if len(gloss_text) > 0:
            sense.glosses.append(gloss_text)
            page_data[-1].senses.append(sense)


def process_sense_children(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    sense_children: WikiNodeChildrenList,
) -> None:
220 """
221 In most cases additional information to a sense is given via special
222 templates or lists. However, sometimes string nodes are used to add
223 information to a preceeding template or list.
225 This function collects the nodes that form a group and calls the relevant
226 methods for extraction.
227 """

    def starts_new_group(child: WikiNode) -> bool:
        # Nested function for readability
        return isinstance(child, WikiNode) and (
            child.kind == NodeKind.TEMPLATE
            or child.kind == NodeKind.LIST
            or child.kind == NodeKind.LINK
        )

    def process_group(
        wxr: WiktextractContext,
        page_data: list[WordEntry],
        group: WikiNodeChildrenList,
    ) -> None:
        # Nested function for readability
        if len(group) == 0:
            return
        elif isinstance(group[0], TemplateNode):
            template_name = group[0].template_name
            if template_name == "clear":
                return
            elif template_name.removesuffix("s") in LINKAGE_TITLES:
                process_linkage_template(wxr, page_data[-1], group[0])
            elif template_name == "ejemplo":
                extract_example(wxr, page_data[-1].senses[-1], group)
            elif template_name == "uso":
                process_uso_template(wxr, page_data[-1].senses[-1], group[0])
            elif template_name == "ámbito":
                process_ambito_template(wxr, page_data[-1].senses[-1], group[0])
            else:
                wxr.wtp.debug(
                    f"Found unexpected group specifying a sense: {group}, "
                    f"head template {template_name}",
                    sortid="extractor/es/page/process_group/102",
                )

        elif isinstance(group[0], WikiNode) and group[0].kind == NodeKind.LIST:
            list_node = group[0]
            # List groups do not seem to be followed by string nodes, so we
            # only process the list_node itself.
            process_sense_data_list(wxr, page_data[-1], list_node)

        elif (
            isinstance(group[0], WikiNode)
            and group[0].kind == NodeKind.LINK
            and "Categoría" in group[0].largs[0][0]
        ):
            # Extract sense categories
            clean_node(wxr, page_data[-1].senses[-1], group[0])

        else:
            wxr.wtp.debug(
                f"Found unexpected group specifying a sense: {group}",
                sortid="extractor/es/page/process_group/117",
            )

    group: WikiNodeChildrenList = []
    for child in sense_children:
        if starts_new_group(child):
            process_group(wxr, page_data, group)
            group = []
        group.append(child)
    process_group(wxr, page_data, group)


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    # style guide
    # https://es.wiktionary.org/wiki/Wikcionario:Guía_de_estilo
    # entry layout
    # https://es.wiktionary.org/wiki/Wikcionario:Estructura
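    # A typical page wraps each language section in a level-2 heading such as
    # "== {{lengua|es}} ==" (illustrative example); the loop below reads the
    # language code from that {{lengua}} template.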
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_code = "unknown"
        lang_name = "unknown"
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://es.wiktionary.org/wiki/Plantilla:lengua
            # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
            if subtitle_template.template_name == "lengua":
                lang_code = subtitle_template.template_parameters.get(1).lower()
                lang_name = clean_node(wxr, categories, subtitle_template)
                break
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
            categories=categories.get("categories", []),
        )
        parse_entries(wxr, page_data, base_data, level2_node)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]
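

# Usage sketch (hypothetical: building a WiktextractContext requires the
# caller's own configuration and a wikitextprocessor.Wtp instance):
#   entries = parse_page(wxr, "agua", page_text)
#   for entry in entries:
#       print(entry["word"], entry.get("pos"))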