Coverage for src/wiktextract/extractor/pt/page.py: 75%
89 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from typing import Any
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .etymology import extract_etymology_section
12from .inflection import extract_conjugation_section, extract_degree_section
13from .linkage import (
14 extract_expression_section,
15 extract_linkage_section,
16 extract_phraseology_section,
17)
18from .models import Sense, WordEntry
19from .pos import extract_pos_section
20from .pronunciation import extract_pronunciation_section
21from .section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS, POS_DATA
22from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one section node to its extractor, then descend into children.

    The heading text, with digits, superscript digits and colons stripped
    from both ends, selects the extractor. Afterwards the categories
    gathered from the heading (for non-POS headings) and from the section's
    direct link children are stored through ``save_section_cats``. Child
    sections are parsed recursively unless the heading is "pronúncia" or
    "ver também" (case-insensitive).

    Args:
        wxr: Extraction context handed on to every extractor.
        page_data: Entries collected so far for the page.
        base_data: Entry used as the extraction target by several
            extractors while ``page_data`` is still empty.
        level_node: The section node being processed.
    """
    title_cats = {}
    title_text = clean_node(wxr, title_cats, level_node.largs).strip(
        "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789:"
    )
    title_key = title_text.lower()
    title_categories = title_cats.get("categories", [])
    is_pos_title = title_key in POS_DATA
    # Most extractors attach their data to the newest entry; before any
    # entry exists they fall back to the language-level base entry.
    target = page_data[-1] if page_data else base_data

    # The order of these checks is significant: the first match wins.
    if is_pos_title:
        extract_pos_section(
            wxr,
            page_data,
            base_data,
            level_node,
            title_text,
            title_categories,
        )
    elif title_text in ("Tradução", "Traduções", "Cognatos", "Descendentes"):
        extract_translation_section(wxr, target, level_node, title_text)
    elif title_text == "Expressões":
        extract_expression_section(wxr, target, level_node)
    elif title_key in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            target,
            level_node,
            LINKAGE_SECTIONS[title_key],
            "",
            0,
            "",
            LINKAGE_TAGS.get(title_key, []),
        )
    elif title_text == "Etimologia":
        extract_etymology_section(wxr, page_data, level_node)
    elif title_text == "Pronúncia":
        extract_pronunciation_section(wxr, page_data, level_node)
    elif title_text == "Fraseologia":
        extract_phraseology_section(wxr, target, level_node)
    elif title_text.startswith(("Nota", "Uso")):
        extract_note_section(wxr, page_data, level_node)
    elif title_text == "Conjugação":
        extract_conjugation_section(wxr, target, level_node)
    elif title_text == "Graus":
        extract_degree_section(wxr, target, level_node)
    elif title_key not in (
        "ver também",
        "ligação externa",
        "ligações externas",
        "ligação extena",
        "referências",
        "referência",
        "no wikcionário",
        "na wikipédia",
        "no wikiquote",
        "no wikispecies",
        "no wikisaurus",
        "no commons",
        "no wikimedia commons",
        "na internet",
        "galeria",
        "galeria de imagens",
    ):
        # Headings in the tuple above are ignored silently; anything else
        # is reported.
        wxr.wtp.debug(f"unknown section: {title_text}")

    # For POS headings the title categories were already handed to
    # extract_pos_section above.
    if not is_pos_title:
        save_section_cats(title_categories, page_data, level_node, True)

    link_cats = {}
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, link_cats, link_node)
    save_section_cats(
        link_cats.get("categories", []), page_data, level_node, False
    )

    if title_key in ("pronúncia", "ver também"):
        return
    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)
def save_section_cats(
    cats: list[str],
    page_data: list[WordEntry],
    level_node: LevelNode,
    from_title: bool,
) -> None:
    """Append ``cats`` to the categories of entries in ``page_data``.

    When ``from_title`` is false, or ``level_node`` is a level-2 node,
    every entry whose ``lang_code`` equals that of the last entry receives
    the categories. Otherwise only the last entry receives them. Nothing
    is stored when ``page_data`` is empty.

    Args:
        cats: Category names to append.
        page_data: Entries collected so far; modified in place.
        level_node: Section node; only its ``kind`` is inspected, and only
            when ``from_title`` is true.
        from_title: Whether ``cats`` came from the section heading.
    """
    whole_language = not from_title or level_node.kind == NodeKind.LEVEL2
    if not whole_language:
        if page_data:
            page_data[-1].categories.extend(cats)
        return

    if not page_data:
        return
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.categories.extend(cats)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dictionaries.

    Pages whose title contains "/traduções" or starts with "Wikisaurus:"
    yield an empty list. Otherwise each level-1 heading is handled as one
    language section: its cleaned heading text is the language name, and
    the name of the first template in the heading, with "-" stripped from
    both ends, is the language code. Either value falls back to "unknown"
    when empty. Entries that end up without senses get a single sense
    tagged "no-gloss".

    Args:
        wxr: Extraction context.
        page_title: Title of the page.
        page_text: Wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dictionary per entry.
    """
    # page layout
    # https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
    is_translation_subpage = "/traduções" in page_title
    if is_translation_subpage or page_title.startswith("Wikisaurus:"):
        # translation and thesaurus pages are skipped
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries: list[WordEntry] = []
    for lang_node in root.find_child(NodeKind.LEVEL1):
        lang_cats = {}
        lang_name = clean_node(wxr, lang_cats, lang_node.largs) or "unknown"

        # Only the first template of the heading is consulted.
        first_template = next(
            iter(lang_node.find_content(NodeKind.TEMPLATE)), None
        )
        lang_code = "unknown"
        if first_template is not None:
            # template "--" strips down to an empty name
            lang_code = first_template.template_name.strip("-") or "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_cats.get("categories", []),
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]
def extract_note_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Collect list-item notes of a section and attach them to entries.

    Every list item found recursively under ``level_node`` is cleaned with
    its nested lists left out; non-empty results become notes. The notes
    are appended to each entry in ``page_data`` whose ``lang_code`` equals
    that of the last entry.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        page_data: Entries collected so far; modified in place.
        level_node: The note section node.
    """
    cleaned_items = (
        clean_node(wxr, None, list(item.invert_find_child(NodeKind.LIST)))
        for item in level_node.find_child_recursively(NodeKind.LIST_ITEM)
    )
    # Items are cleaned before page_data is looked at, even if it is empty.
    notes = [text for text in cleaned_items if text != ""]

    if not page_data:
        return
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.notes.extend(notes)