Coverage for src/wiktextract/extractor/pt/page.py: 77%
97 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1from typing import Any
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .etymology import extract_etymology_section
12from .inflection import extract_conjugation_section, extract_degree_section
13from .linkage import (
14 extract_expression_section,
15 extract_forms_section,
16 extract_linkage_section,
17 extract_phraseology_section,
18)
19from .models import Sense, WordEntry
20from .pos import extract_pos_section
21from .pronunciation import extract_pronunciation_section
22from .section_titles import (
23 FORM_SECTION_TAGS,
24 LINKAGE_SECTIONS,
25 LINKAGE_TAGS,
26 POS_DATA,
27)
28from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one section heading to its extractor, then recurse.

    The heading text is cleaned and stripped of digits, superscript digits,
    colons, spaces and newlines, then matched against the known section
    titles. Categories collected from the heading and from link nodes that
    are direct children of the section are stored via ``save_section_cats``.
    Child sections are parsed recursively unless the heading is
    "pronúncia" or "ver também" (case-insensitive).

    Args:
        wxr: Extraction context; also used to emit debug messages.
        page_data: Entries collected so far; may be appended to or popped.
        base_data: Entry used as the target when ``page_data`` is empty.
        level_node: The section node being processed.
    """

    def latest_entry() -> WordEntry:
        # Resolved lazily: the POS branch may pop from page_data right
        # before the target entry is needed.
        return page_data[-1] if page_data else base_data

    title_cats = {}
    title_text = clean_node(wxr, title_cats, level_node.largs).strip(
        "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789: \n"
    )
    title_key = title_text.lower()

    # Sections whose extractor takes exactly (wxr, entry, level_node).
    entry_handlers = {
        "Fraseologia": extract_phraseology_section,
        "Conjugação": extract_conjugation_section,
        "Graus": extract_degree_section,
    }

    if title_key in POS_DATA:
        extract_pos_section(
            wxr,
            page_data,
            base_data,
            level_node,
            title_text,
            title_cats.get("categories", []),
        )
        if len(page_data[-1].senses) == 0:
            # The newest entry has no senses: for these titles it is
            # discarded and the section is re-read as forms/expressions.
            if title_text in FORM_SECTION_TAGS:
                page_data.pop()
                extract_forms_section(
                    wxr, latest_entry(), level_node, title_text
                )
            elif title_text == "Expressão":
                page_data.pop()
                extract_expression_section(wxr, latest_entry(), level_node)
    elif title_text in ("Tradução", "Traduções", "Cognatos", "Descendentes"):
        extract_translation_section(
            wxr, latest_entry(), level_node, title_text
        )
    elif title_text == "Expressões":
        extract_expression_section(wxr, latest_entry(), level_node)
    elif title_key in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            latest_entry(),
            level_node,
            LINKAGE_SECTIONS[title_key],
            "",
            0,
            "",
            LINKAGE_TAGS.get(title_key, []),
        )
    elif title_text == "Etimologia":
        extract_etymology_section(wxr, page_data, level_node)
    elif title_text in ("Pronúncia", "Romanização"):
        extract_pronunciation_section(wxr, page_data, level_node)
    elif title_text in entry_handlers:
        entry_handlers[title_text](wxr, latest_entry(), level_node)
    elif title_text.startswith(("Nota", "Uso")):
        extract_note_section(wxr, page_data, level_node)
    elif title_text in FORM_SECTION_TAGS:
        extract_forms_section(wxr, latest_entry(), level_node, title_text)
    elif title_key not in (
        "ver também",
        "ligação externa",
        "ligações externas",
        "ligação extena",
        "referências",
        "referência",
        "no wikcionário",
        "na wikipédia",
        "no wikiquote",
        "no wikispecies",
        "no wikisaurus",
        "no commons",
        "no wikimedia commons",
        "na internet",
        "galeria",
        "galeria de imagens",
        "brasil",
        "portugal",
    ):
        wxr.wtp.debug(f"unknown section: {title_text}")

    # Heading categories of POS sections were already handed to
    # extract_pos_section above, so only non-POS headings are saved here.
    if title_key not in POS_DATA:
        save_section_cats(
            title_cats.get("categories", []), page_data, level_node, True
        )

    # Categories coming from link nodes that are direct children.
    link_cats = {}
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, link_cats, link_node)
    save_section_cats(
        link_cats.get("categories", []), page_data, level_node, False
    )

    if title_key not in ("pronúncia", "ver também"):
        for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, child_section)
def save_section_cats(
    cats: list[str],
    page_data: list[WordEntry],
    level_node: LevelNode,
    from_title: bool,
) -> None:
    """Append ``cats`` to the categories of the relevant entries.

    When the categories do not come from a heading, or come from a
    level-2 heading, they are added to every entry whose ``lang_code``
    equals that of the last entry. Otherwise they are added to the last
    entry only. Nothing happens when ``page_data`` is empty.

    Args:
        cats: Category names to add.
        page_data: Entries collected so far.
        level_node: Section node; only inspected when ``from_title`` is true.
        from_title: Whether ``cats`` were collected from the heading text.
    """
    if not page_data:
        return

    newest = page_data[-1]
    if from_title and level_node.kind != NodeKind.LEVEL2:
        newest.categories.extend(cats)
        return

    current_lang_code = newest.lang_code
    for entry in page_data:
        if entry.lang_code == current_lang_code:
            entry.categories.extend(cats)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page into a list of dumped word entries.

    Pages whose title contains "/traduções" or "/tradução", or starts
    with "Wikisaurus:", yield an empty list. For every level-1 section the
    language name comes from the heading text and the language code from
    the name of the first template in the heading (with "-" stripped);
    both fall back to "unknown". Entries left without senses receive a
    single sense tagged "no-gloss".

    Args:
        wxr: Extraction context.
        page_title: Title of the page.
        page_text: Wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dict per collected entry.
    """
    # page layout
    # https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
    is_translation_subpage = any(
        marker in page_title for marker in ("/traduções", "/tradução")
    )
    if is_translation_subpage or page_title.startswith("Wikisaurus:"):
        # skip translation and thesaurus pages
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries: list[WordEntry] = []

    for language_node in root.find_child(NodeKind.LEVEL1):
        language_cats = {}
        language_name = clean_node(wxr, language_cats, language_node.largs)
        if language_name == "":
            language_name = "unknown"

        # Only the first template in the heading is consulted.
        first_template = next(
            iter(language_node.find_content(NodeKind.TEMPLATE)), None
        )
        if first_template is None:
            language_code = "unknown"
        else:
            # template "--" strips down to an empty name
            language_code = first_template.template_name.strip("-") or "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and language_code not in wanted_codes:
            continue

        wxr.wtp.start_section(language_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=language_code,
            lang=language_name,
            pos="unknown",
            categories=language_cats.get("categories", []),
        )
        for section_node in language_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]
def extract_note_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Collect list-item texts of a section and store them as notes.

    Each list item found recursively under ``level_node`` is cleaned with
    its nested lists left out; non-empty results are appended to the
    ``notes`` of every entry whose ``lang_code`` equals that of the last
    entry in ``page_data``.

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far.
        level_node: The note/usage section node.
    """
    cleaned_items = (
        clean_node(wxr, None, list(item.invert_find_child(NodeKind.LIST)))
        for item in level_node.find_child_recursively(NodeKind.LIST_ITEM)
    )
    notes = [text for text in cleaned_items if text != ""]

    if not page_data:
        return
    current_lang_code = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang_code:
            entry.notes.extend(notes)