Coverage for src/wiktextract/extractor/it/page.py: 91%
59 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from typing import Any
3from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .etymology import extract_citation_section, extract_etymology_section
8from .linkage import extract_linkage_section
9from .models import Sense, WordEntry
10from .pos import extract_pos_section
11from .section_titles import LINKAGE_SECTIONS, POS_DATA
12from .sound import extract_hyphenation_section, extract_pronunciation_section
13from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one section heading to its extractor, then walk its subsections.

    The heading text is obtained by cleaning ``level_node.largs``.  It is
    checked against ``POS_DATA`` first, then against a fixed set of section
    titles, and finally against ``LINKAGE_SECTIONS``.  When one of these
    matches, the subsection is registered with ``wxr.wtp.start_subsection``
    before the extractor runs.  Child level nodes are always visited
    recursively, whether or not the heading itself matched anything.
    """
    heading = clean_node(wxr, None, level_node.largs)

    # Extractors sharing the call shape (wxr, page_data, level_node),
    # keyed by the exact heading text that selects them.
    fixed_title_extractors = {
        "Traduzione": extract_translation_section,
        "Etimologia / Derivazione": extract_etymology_section,
        "Citazione": extract_citation_section,
        "Sillabazione": extract_hyphenation_section,
        "Pronuncia": extract_pronunciation_section,
    }

    if heading in POS_DATA:
        wxr.wtp.start_subsection(heading)
        extract_pos_section(wxr, page_data, base_data, level_node, heading)
    elif heading in fixed_title_extractors:
        wxr.wtp.start_subsection(heading)
        fixed_title_extractors[heading](wxr, page_data, level_node)
    elif heading in LINKAGE_SECTIONS:
        wxr.wtp.start_subsection(heading)
        linkage_kind = LINKAGE_SECTIONS[heading]
        extract_linkage_section(wxr, page_data, level_node, linkage_kind)

    # Descend into nested headings regardless of the outcome above.
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Each level-2 node of the parsed tree is handled as one language section.
    Sections titled "Altri progetti" or "Note / Riferimenti" are skipped.
    The language code is the name of the first template yielded by
    ``find_content(NodeKind.TEMPLATE)`` with "-" stripped from both ends, or
    "unknown" when there is none.  A section is also skipped when
    ``wxr.config.capture_language_codes`` is set and does not contain that
    code.  Entries that end up with no senses receive a single sense tagged
    "no-gloss".  The result is each entry dumped with
    ``model_dump(exclude_defaults=True)``.
    """
    # page layout
    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
    # https://it.wiktionary.org/wiki/Aiuto:Come_iniziare_una_pagina
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    skipped_titles = ("Altri progetti", "Note / Riferimenti")

    for language_node in root.find_child(NodeKind.LEVEL2):
        lang_cats = {}
        lang_name = clean_node(wxr, lang_cats, language_node.largs)
        if lang_name in skipped_titles:
            continue

        # Only the first template is consulted for the language code.
        first_template = next(
            iter(language_node.find_content(NodeKind.TEMPLATE)), None
        )
        if first_template is None:
            lang_code = "unknown"
        else:
            lang_code = first_template.template_name.strip("-")

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_cats.get("categories", []),
        )
        for section_node in language_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, section_node)

    dumped: list[dict[str, Any]] = []
    for entry in page_data:
        if not entry.senses:
            # Placeholder sense for entries where nothing was extracted.
            entry.senses.append(Sense(tags=["no-gloss"]))
        dumped.append(entry.model_dump(exclude_defaults=True))
    return dumped