Coverage for src/wiktextract/extractor/es/page.py: 79%
109 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
from typing import Any

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .conjugation import extract_conjugation_section
from .etymology import extract_etymology_section
from .linkage import (
    extract_additional_information_section,
    extract_alt_form_section,
    extract_linkage_section,
)
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .pronunciation import process_pron_graf_template
from .section_titles import (
    IGNORED_TITLES,
    LINKAGE_TITLES,
    POS_TITLES,
    TRANSLATIONS_TITLES,
)
from .translation import extract_translation_section


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Parse one section of a language entry, then recurse into its children.

    Handles the individual sibling sections of an entry,
    e.g. https://es.wiktionary.org/wiki/amor:

        === Etimología ===
        === {{sustantivo masculino|es}} ===
        === Locuciones ===

    The cleaned, lower-cased heading text selects the extractor to run.
    Part-of-speech headings are also matched by the name of the first
    template in the heading, by the first two words of the title, and by
    its first word.

    Args:
        wxr: Extraction context; provides the config flags and the debug
            logger used here.
        page_data: Entries collected so far for the page. A part-of-speech
            section appends a copy of ``base_data``; most other branches
            append such a copy first when the list is still empty.
        base_data: Template entry that is deep-copied for new entries.
        level_node: The heading node of the section to process.
    """
    categories = {}
    original_section_title = clean_node(wxr, categories, level_node.largs)
    section_title = original_section_title.lower()
    wxr.wtp.start_subsection(original_section_title)

    # Name of the first template found in the heading, if there is one.
    pos_template_name = ""
    for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
        pos_template_name = level_node_template.template_name
        break

    # Candidate lookup keys for POS_TITLES, tried in this order.
    title_words = section_title.split()
    pos_keys = [
        section_title,
        pos_template_name,
        " ".join(title_words[:2]),
    ]
    # An empty or whitespace-only heading has no first word; indexing
    # ``split()[0]`` unconditionally would raise IndexError.
    if title_words:
        pos_keys.append(title_words[0])

    if section_title in IGNORED_TITLES:
        pass
    elif any(key in POS_TITLES for key in pos_keys):
        pos_data = None
        for key in pos_keys:
            pos_data = POS_TITLES.get(key)
            if pos_data is not None:
                break
        if pos_data is not None:
            pos_type = pos_data["pos"]
            page_data.append(base_data.model_copy(deep=True))
            page_data[-1].pos = pos_type
            page_data[-1].pos_title = original_section_title
            page_data[-1].tags.extend(pos_data.get("tags", []))
            page_data[-1].categories.extend(categories.get("categories", []))
            extract_pos_section(wxr, page_data[-1], level_node, section_title)
            if len(page_data[-1].senses) == 0:
                # The new entry has no senses: drop it again when it is a
                # form-of entry, or when the title is really a linkage title.
                if "form-of" in page_data[-1].tags:
                    page_data.pop()
                elif section_title in LINKAGE_TITLES:
                    page_data.pop()
                    # NOTE(review): page_data may be empty after this pop;
                    # confirm extract_linkage_section tolerates an empty list.
                    extract_linkage_section(
                        wxr,
                        page_data,
                        level_node,
                        LINKAGE_TITLES[section_title],
                    )
    elif (
        section_title.startswith("etimología")
        and wxr.config.capture_etymologies
    ):
        # When LEVEL_KIND_FLAGS nodes are found inside this section, extract
        # into a copy: the caller's base_data stays untouched and only the
        # recursion at the end of this function sees the etymology.
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif (
        section_title in TRANSLATIONS_TITLES and wxr.config.capture_translations
    ):
        if not page_data:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation_section(wxr, page_data, level_node)
    elif section_title == "descendientes":
        if not page_data:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation_section(wxr, page_data, level_node, False)
    elif (
        section_title in LINKAGE_TITLES
        or section_title.removesuffix("s") in LINKAGE_TITLES
    ):
        # Accept a title with a trailing "s" by falling back to the key
        # without that suffix.
        if section_title not in LINKAGE_TITLES:
            section_title = section_title.removesuffix("s")
        if not page_data:
            page_data.append(base_data.model_copy(deep=True))
        extract_linkage_section(
            wxr, page_data, level_node, LINKAGE_TITLES[section_title]
        )
    elif section_title == "conjugación":
        if not page_data:
            page_data.append(base_data.model_copy(deep=True))
        extract_conjugation_section(wxr, page_data, level_node)
    elif section_title == "formas alternativas":
        extract_alt_form_section(
            wxr, page_data[-1] if page_data else base_data, level_node
        )
    elif section_title == "información adicional":
        extract_additional_information_section(
            wxr, page_data[-1] if page_data else base_data, level_node
        )
    else:
        wxr.wtp.debug(
            f"Unprocessed section: {section_title}",
            sortid="extractor/es/page/parse_section/48",
        )

    # Run clean_node over the section's direct link children, targeting the
    # latest entry (or base_data when no entry exists yet).
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, page_data[-1] if page_data else base_data, link_node)

    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a Spanish Wiktionary page into a list of word-entry dicts.

    Each level-2 section is treated as one language section: its language
    code and name are read from the ``lengua`` template in the heading, a
    base ``WordEntry`` is built, and every child section is handed to
    ``parse_section``.

    Args:
        wxr: Extraction context (parser, config and logging hooks).
        page_title: Title of the page; used as the ``word`` of each entry.
        page_text: Raw wikitext of the page.

    Returns:
        One dict per collected entry, dumped with ``exclude_defaults=True``.
        An entry without senses gets a single ``no-gloss`` sense first.
    """
    # style guide
    # https://es.wiktionary.org/wiki/Wikcionario:Guía_de_estilo
    # entry layout
    # https://es.wiktionary.org/wiki/Wikcionario:Estructura
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_code = "unknown"
        lang_name = "unknown"
        section_title = clean_node(wxr, None, level2_node.largs)
        if section_title.lower() == "referencias y notas":
            continue
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://es.wiktionary.org/wiki/Plantilla:lengua
            # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
            if subtitle_template.template_name == "lengua":
                # A missing or non-string first argument keeps the
                # "unknown" default instead of raising AttributeError.
                lang_arg = subtitle_template.template_parameters.get(1)
                if isinstance(lang_arg, str):
                    lang_code = lang_arg.lower()
                lang_name = clean_node(wxr, categories, subtitle_template)
                break
        # Skip languages that were not requested in the configuration.
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
            categories=categories.get("categories", []),
        )
        # Direct children of the language section: the "pron-graf" template
        # and link nodes are applied to base_data.
        for node in level2_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
            if (
                isinstance(node, TemplateNode)
                and node.template_name == "pron-graf"
            ):
                process_pron_graf_template(wxr, base_data, node)
            elif node.kind == NodeKind.LINK:
                clean_node(wxr, base_data, node)

        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    for data in page_data:
        if not data.senses:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]