Coverage for src/wiktextract/extractor/es/page.py: 78%
111 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-09 23:59 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-09 23:59 +0000
1from wikitextprocessor.parser import (
2 LEVEL_KIND_FLAGS,
3 NodeKind,
4 TemplateNode,
5 WikiNode,
6)
8from ...page import clean_node
9from ...wxr_context import WiktextractContext
10from ...wxr_logging import logger
11from .conjugation import extract_conjugation_section
12from .etymology import extract_etymology_section
13from .linkage import (
14 extract_additional_information_section,
15 extract_alt_form_section,
16 extract_linkage_section,
17)
18from .models import Sense, WordEntry
19from .pos import extract_pos_section
20from .pronunciation import process_pron_graf_template
21from .section_titles import (
22 IGNORED_TITLES,
23 LINKAGE_TITLES,
24 POS_TITLES,
25 TRANSLATIONS_TITLES,
26)
27from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """
    Parse one section of an entry, then recurse into its subsections.

    Sibling sections of an entry look like this,
    e.g. https://es.wiktionary.org/wiki/amor:

    === Etimología ===
    === {{sustantivo masculino|es}} ===
    === Locuciones ===

    The lower-cased heading text (and, for part-of-speech headings, the name
    of the first template in the heading) selects which extractor handles
    the section. Results are written into ``page_data`` / ``base_data``;
    nothing is returned.
    """

    heading_categories = {}
    original_section_title = clean_node(
        wxr, heading_categories, level_node.largs
    )
    section_title = original_section_title.lower()
    wxr.wtp.start_subsection(original_section_title)
    if not section_title:
        return None

    # Name of the first template found in the heading, or "" if there is none.
    first_template_name = next(
        (
            heading_template.template_name
            for heading_template in level_node.find_content(NodeKind.TEMPLATE)
        ),
        "",
    )

    # Candidate keys for the part-of-speech table, tried in this order:
    # full title, heading template name, first two words, first word.
    title_words = section_title.split()
    pos_keys = [
        section_title,
        first_template_name,
        " ".join(title_words[:2]),
        title_words[0],
    ]

    if section_title in IGNORED_TITLES:
        pass
    elif any(key in POS_TITLES for key in pos_keys):
        # First non-None table entry among the candidate keys.
        pos_data = next(
            (
                candidate
                for candidate in map(POS_TITLES.get, pos_keys)
                if candidate is not None
            ),
            None,
        )
        if pos_data is not None:
            pos_type = pos_data["pos"]
            entry = base_data.model_copy(deep=True)
            page_data.append(entry)
            entry.pos = pos_type
            entry.pos_title = original_section_title
            entry.tags.extend(pos_data.get("tags", []))
            entry.categories.extend(heading_categories.get("categories", []))
            extract_pos_section(wxr, entry, level_node, section_title)
            if not entry.senses:
                if "form-of" in entry.tags:
                    # Form-of entry without any sense: drop it.
                    page_data.pop()
                elif section_title in LINKAGE_TITLES:
                    # The title is also a linkage title and produced no
                    # senses: drop the new entry and process the section
                    # as linkage data with the remaining entries.
                    page_data.pop()
                    extract_linkage_section(
                        wxr,
                        page_data,
                        level_node,
                        LINKAGE_TITLES[section_title],
                    )
    elif (
        section_title.startswith("etimología")
        and wxr.config.capture_etymologies
    ):
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            # The section has subsections: work on a private copy, which is
            # also the base passed to those subsections in the recursion
            # at the end of this function.
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif (
        section_title in TRANSLATIONS_TITLES and wxr.config.capture_translations
    ):
        _ensure_entry(page_data, base_data)
        extract_translation_section(wxr, page_data, level_node)
    elif section_title == "descendientes":
        _ensure_entry(page_data, base_data)
        extract_translation_section(wxr, page_data, level_node, False)
    elif (
        section_title in LINKAGE_TITLES
        or section_title.removesuffix("s") in LINKAGE_TITLES
    ):
        # Accept a title with a trailing "s" when only the form without it
        # is present in the table.
        if section_title in LINKAGE_TITLES:
            linkage_key = section_title
        else:
            linkage_key = section_title.removesuffix("s")
        _ensure_entry(page_data, base_data)
        extract_linkage_section(
            wxr, page_data, level_node, LINKAGE_TITLES[linkage_key]
        )
    elif section_title == "conjugación":
        _ensure_entry(page_data, base_data)
        extract_conjugation_section(wxr, page_data, level_node)
    elif section_title == "formas alternativas":
        alt_form_target = page_data[-1] if page_data else base_data
        extract_alt_form_section(wxr, alt_form_target, level_node)
    elif section_title == "información adicional":
        extra_info_target = page_data[-1] if page_data else base_data
        extract_additional_information_section(
            wxr, extra_info_target, level_node
        )
    else:
        wxr.wtp.debug(
            f"Unprocessed section: {section_title}",
            sortid="extractor/es/page/parse_section/48",
        )

    # Run direct-child links through clean_node against the latest entry,
    # or against base_data when no entry exists yet.
    link_target = page_data[-1] if page_data else base_data
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, link_target, link_node)

    for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level_node)


def _ensure_entry(page_data: list[WordEntry], base_data: WordEntry) -> None:
    """Append a deep copy of ``base_data`` when ``page_data`` is empty."""
    if not page_data:
        page_data.append(base_data.model_copy(deep=True))
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, any]]:
    """
    Parse the wikitext of one page into a list of word-entry dicts.

    Each level-2 section is treated as one language section: its language
    code and name are read from the ``lengua`` template in the heading, and
    its child sections are handed to ``parse_section``. Level-2 sections
    titled "referencias y notas" are skipped, as are languages excluded by
    ``wxr.config.capture_language_codes``.

    :param wxr: extraction context (config, wikitext processor).
    :param page_title: title of the page; stored as the ``word`` of entries.
    :param page_text: wikitext of the page.
    :return: one ``model_dump(exclude_defaults=True)`` dict per word entry.
    """
    # style guide
    # https://es.wiktionary.org/wiki/Wikcionario:Guía_de_estilo
    # entry layout
    # https://es.wiktionary.org/wiki/Wikcionario:Estructura
    # NOTE(review): `any` in the return annotation is the builtin function;
    # `typing.Any` was presumably intended. Left unchanged because no
    # `typing` import is visible in this module.
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_code = "unknown"
        lang_name = "unknown"
        section_title = clean_node(wxr, None, level2_node.largs)
        if section_title.lower() == "referencias y notas":
            continue
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://es.wiktionary.org/wiki/Plantilla:lengua
            # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
            if subtitle_template.template_name == "lengua":
                # The first positional argument may be missing (``.get``
                # returns None) or not a plain string; calling ``.lower()``
                # on it unconditionally would abort parsing of the whole
                # page. In those cases keep the "unknown" default.
                raw_lang_code = subtitle_template.template_parameters.get(1)
                if isinstance(raw_lang_code, str):
                    lang_code = raw_lang_code.lower()
                lang_name = clean_node(wxr, categories, subtitle_template)
                break
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
            categories=categories.get("categories", []),
        )
        # Templates and links that are direct children of the language
        # section (outside any subsection) apply to the shared base entry.
        for node in level2_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
            if (
                isinstance(node, TemplateNode)
                and node.template_name == "pron-graf"
            ):
                process_pron_graf_template(wxr, base_data, node)
            elif node.kind == NodeKind.LINK:
                clean_node(wxr, base_data, node)

        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    # Every entry must carry at least one sense; mark empty ones explicitly.
    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]