Coverage for src/wiktextract/extractor/it/page.py: 80%
65 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from typing import Any
3from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .etymology import extract_citation_section, extract_etymology_section
8from .linkage import extract_form_section, extract_linkage_section
9from .models import Sense, WordEntry
10from .pos import extract_note_section, extract_pos_section
11from .section_titles import LINKAGE_SECTIONS, POS_DATA
12from .sound import extract_hyphenation_section, extract_pronunciation_section
13from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one section node to its extractor, then recurse into children.

    The cleaned heading text of ``level_node`` selects the extractor. Every
    child level node is then processed by a recursive call, whether or not
    the heading was recognized.

    Args:
        wxr: Extraction context; ``wxr.wtp`` receives subsection and debug
            calls.
        page_data: Entry list handed on to the extractors.
        base_data: Passed only to the part-of-speech extractor.
        level_node: The section node being dispatched.
    """
    heading = clean_node(wxr, None, level_node.largs)

    # Extractors that all take exactly (wxr, page_data, level_node).
    simple_extractors = {
        "Traduzione": extract_translation_section,
        "Etimologia / Derivazione": extract_etymology_section,
        "Citazione": extract_citation_section,
        "Sillabazione": extract_hyphenation_section,
        "Pronuncia": extract_pronunciation_section,
    }
    form_headings = ("Variazione", "Forme flesse", "Variazioni", "Variante")

    # Branch order matters: part-of-speech headings take priority over
    # every other category.
    if heading in POS_DATA or heading.startswith("Trascrizione"):
        wxr.wtp.start_subsection(heading)
        extract_pos_section(wxr, page_data, base_data, level_node, heading)
    elif heading in simple_extractors:
        wxr.wtp.start_subsection(heading)
        simple_extractors[heading](wxr, page_data, level_node)
    elif heading in LINKAGE_SECTIONS:
        wxr.wtp.start_subsection(heading)
        linkage_type = LINKAGE_SECTIONS[heading]
        extract_linkage_section(wxr, page_data, level_node, linkage_type)
    elif heading == "Uso / Precisazioni":
        # Handled without a start_subsection call.
        extract_note_section(wxr, page_data, level_node)
    elif heading in form_headings:
        # Handled without a start_subsection call.
        extract_form_section(wxr, page_data, level_node)
    elif heading != "Note / Riferimenti":
        # "Note / Riferimenti" is skipped silently; anything else unknown
        # is reported through the debug channel.
        wxr.wtp.debug(
            f"Unknown section: {heading}",
            sortid="extractor/it/page/parse_section/49",
        )

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Each level-2 node is treated as a language section. A base ``WordEntry``
    is built for it and its child sections are passed to ``parse_section``.
    Any entry left without senses gets a single ``no-gloss`` sense before
    the entries are dumped.

    Args:
        wxr: Extraction context providing the parser (``wxr.wtp``) and the
            configuration (``wxr.config``).
        page_title: Title passed to ``wxr.wtp.start_page``.
        page_text: Wikitext source of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dict per collected entry.
    """
    # page layout
    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
    # https://it.wiktionary.org/wiki/Aiuto:Come_iniziare_una_pagina
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []
    skipped_headings = ("Altri progetti", "Note / Riferimenti")

    for lang_node in root.find_child(NodeKind.LEVEL2):
        heading_cats = {}
        lang_name = clean_node(wxr, heading_cats, lang_node.largs)
        if lang_name in skipped_headings:
            continue

        # The language code comes from the first template found in the
        # heading content; with no template it stays "unknown".
        first_template = next(
            iter(lang_node.find_content(NodeKind.TEMPLATE)), None
        )
        if first_template is None:
            lang_code = "unknown"
        else:
            lang_code = first_template.template_name.strip("-")

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=heading_cats.get("categories", []),
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]