Coverage for src/wiktextract/extractor/pt/page.py: 77%
97 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from typing import Any
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .etymology import extract_etymology_section
12from .inflection import extract_conjugation_section, extract_degree_section
13from .linkage import (
14 extract_expression_section,
15 extract_forms_section,
16 extract_linkage_section,
17 extract_phraseology_section,
18)
19from .models import Sense, WordEntry
20from .pos import extract_pos_section
21from .pronunciation import extract_pronunciation_section
22from .section_titles import (
23 FORM_SECTION_TAGS,
24 LINKAGE_SECTIONS,
25 LINKAGE_TAGS,
26 POS_DATA,
27)
28from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section heading to its extractor, then recurse.

    The heading is cleaned with `clean_node` and stripped of superscript
    and ASCII digits, colons, spaces and newlines before being matched
    against the known section titles. Categories collected while cleaning
    the heading, and while cleaning the section's direct link children,
    are stored through `save_section_cats`. Child sections are parsed
    recursively unless the heading is "pronúncia" or "ver também"
    (compared case-insensitively).
    """

    def current_entry() -> WordEntry:
        # Resolved at call time because `page_data` is popped inside the
        # POS branch before the entry is looked up.
        return page_data[-1] if len(page_data) > 0 else base_data

    cats = {}
    raw_title = clean_node(wxr, cats, level_node.largs)
    title_text = raw_title.strip("⁰¹²³⁴⁵⁶⁷⁸⁹0123456789: \n")
    lower_title = title_text.lower()
    is_pos_title = lower_title in POS_DATA

    # Branch order matters: a title may belong to more than one of the
    # imported title collections, and the first match wins.
    if is_pos_title:
        extract_pos_section(
            wxr,
            page_data,
            base_data,
            level_node,
            title_text,
            cats.get("categories", []),
        )
        if len(page_data[-1].senses) == 0:
            # The newest entry has no senses: drop it and hand the section
            # to the forms/expression extractor instead.
            if title_text in FORM_SECTION_TAGS:
                page_data.pop()
                extract_forms_section(
                    wxr, current_entry(), level_node, title_text
                )
            elif title_text == "Expressão":
                page_data.pop()
                extract_expression_section(wxr, current_entry(), level_node)
    elif title_text in ["Tradução", "Traduções", "Cognatos", "Descendentes"]:
        extract_translation_section(
            wxr, current_entry(), level_node, title_text
        )
    elif title_text == "Expressões":
        extract_expression_section(wxr, current_entry(), level_node)
    elif lower_title in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            current_entry(),
            level_node,
            LINKAGE_SECTIONS[lower_title],
            "",
            0,
            "",
            LINKAGE_TAGS.get(lower_title, []),
        )
    elif title_text == "Etimologia":
        extract_etymology_section(wxr, page_data, level_node)
    elif title_text in ["Pronúncia", "Romanização"]:
        pronunciation_targets = page_data if page_data else [base_data]
        extract_pronunciation_section(wxr, pronunciation_targets, level_node)
    elif title_text == "Fraseologia":
        extract_phraseology_section(wxr, current_entry(), level_node)
    elif title_text.startswith(("Nota", "Uso")):
        extract_note_section(wxr, page_data, level_node)
    elif title_text == "Conjugação":
        extract_conjugation_section(wxr, current_entry(), level_node)
    elif title_text == "Graus":
        extract_degree_section(wxr, current_entry(), level_node)
    elif title_text in FORM_SECTION_TAGS:
        extract_forms_section(wxr, current_entry(), level_node, title_text)
    elif lower_title not in {
        "ver também",
        "ligação externa",
        "ligações externas",
        "ligação extena",
        "referências",
        "referência",
        "no wikcionário",
        "na wikipédia",
        "no wikiquote",
        "no wikispecies",
        "no wikisaurus",
        "no commons",
        "no wikimedia commons",
        "na internet",
        "galeria",
        "galeria de imagens",
        "brasil",
        "portugal",
    }:
        # Titles in the set above are silently ignored; anything else is
        # reported so it can be reviewed.
        wxr.wtp.debug(f"unknown section: {title_text}")

    # For POS titles the heading categories were already passed to
    # `extract_pos_section`, so they are only saved here for other titles.
    if not is_pos_title:
        title_cats = cats.get("categories", [])
        save_section_cats(title_cats, page_data, level_node, True)

    # Categories produced by the section's direct link children.
    cats = {}
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, cats, link_node)
    save_section_cats(cats.get("categories", []), page_data, level_node, False)

    if lower_title in ["pronúncia", "ver também"]:
        return
    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)
def save_section_cats(
    cats: list[str],
    page_data: list[WordEntry],
    level_node: LevelNode,
    from_title: bool,
) -> None:
    """Append `cats` to the categories of the relevant word entries.

    Categories taken from a heading (`from_title` true) whose node kind is
    not `NodeKind.LEVEL2` go to the last entry only. In every other case
    they go to each entry whose `lang_code` equals that of the last entry.
    Nothing happens when `page_data` is empty.
    """
    if not page_data:
        return

    last_entry = page_data[-1]
    if from_title and level_node.kind != NodeKind.LEVEL2:
        last_entry.categories.extend(cats)
        return

    for entry in page_data:
        if entry.lang_code == last_entry.lang_code:
            entry.categories.extend(cats)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dictionaries.

    Page layout reference:
    https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo

    Each level-1 node is treated as a language section: its heading gives
    the language name, and the name of the first template found in the
    heading content (with "-" stripped from both ends) gives the language
    code. Entries left without any sense receive a single "no-gloss"
    sense. Each entry is serialized with
    `model_dump(exclude_defaults=True)`.
    """
    # skip translation and thesaurus pages
    if page_title.startswith("Wikisaurus:") or any(
        marker in page_title for marker in ("/traduções", "/tradução")
    ):
        return []

    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level1_node in tree.find_child(NodeKind.LEVEL1):
        lang_cats = {}
        lang_name = clean_node(wxr, lang_cats, level1_node.largs) or "unknown"

        # Only the first template in the heading is consulted; a template
        # named "--" strips down to an empty code and counts as unknown.
        lang_code = "unknown"
        first_template = next(
            iter(level1_node.find_content(NodeKind.TEMPLATE)), None
        )
        if first_template is not None:
            lang_code = first_template.template_name.strip("-") or "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_cats.get("categories", []),
        )
        for next_level_node in level1_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    results = []
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
        results.append(entry.model_dump(exclude_defaults=True))
    return results
def extract_note_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Collect the notes of a section and attach them to word entries.

    Every list item found recursively under `level_node` is cleaned using
    the children returned by `invert_find_child(NodeKind.LIST)`
    (presumably everything except nested lists). Non-empty results are
    appended to the `notes` of each entry whose `lang_code` equals that of
    the last entry in `page_data`.
    """
    cleaned_items = (
        clean_node(wxr, None, list(item.invert_find_child(NodeKind.LIST)))
        for item in level_node.find_child_recursively(NodeKind.LIST_ITEM)
    )
    notes = [text for text in cleaned_items if text != ""]

    if not page_data:
        return
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.notes.extend(notes)