Coverage for src/wiktextract/extractor/it/page.py: 91%

1from typing import Any

3from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from .etymology import extract_citation_section, extract_etymology_section

8from .linkage import extract_linkage_section

9from .models import Sense, WordEntry

10from .pos import extract_pos_section

11from .section_titles import LINKAGE_SECTIONS, POS_DATA

12from .sound import extract_hyphenation_section, extract_pronunciation_section

13from .translation import extract_translation_section

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one level node to the matching extractor, then recurse.

    The cleaned text of ``level_node.largs`` is compared against the known
    section titles.  When it matches, a subsection with that text is started
    on ``wxr.wtp`` and the corresponding ``extract_*`` function is called.
    Unrecognised titles are skipped without starting a subsection.  In every
    case the child level nodes are processed recursively afterwards.

    Args:
        wxr: Extraction context; passed through to every extractor.
        page_data: Word entries collected so far for the page; passed to the
            extractors.
        base_data: Language-level template entry; only handed to the
            part-of-speech extractor and to the recursive calls.
        level_node: The level node to process.
    """
    heading = clean_node(wxr, None, level_node.largs)

    # Sections whose extractor takes exactly ``(wxr, page_data, level_node)``.
    plain_extractors = {
        "Traduzione": extract_translation_section,
        "Etimologia / Derivazione": extract_etymology_section,
        "Citazione": extract_citation_section,
        "Sillabazione": extract_hyphenation_section,
        "Pronuncia": extract_pronunciation_section,
    }

    # Precedence matters: POS titles win, then the fixed titles above, and
    # linkage titles are consulted last.
    if heading in POS_DATA:
        wxr.wtp.start_subsection(heading)
        extract_pos_section(wxr, page_data, base_data, level_node, heading)
    elif heading in plain_extractors:
        wxr.wtp.start_subsection(heading)
        plain_extractors[heading](wxr, page_data, level_node)
    elif heading in LINKAGE_SECTIONS:
        wxr.wtp.start_subsection(heading)
        linkage_kind = LINKAGE_SECTIONS[heading]
        extract_linkage_section(wxr, page_data, level_node, linkage_kind)

    # Nested level nodes are always visited, whether or not this one matched.
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page into a list of word-entry dictionaries.

    Each level-2 node of the parsed page is handled in turn: its cleaned
    title becomes the language name, and the name of the first template found
    in its content (with surrounding ``-`` stripped) becomes the language
    code, or ``"unknown"`` when there is no template.  The node's sub-levels
    are passed to ``parse_section`` together with a base ``WordEntry``.

    Args:
        wxr: Extraction context providing the parser (``wxr.wtp``) and the
            configuration (``wxr.config``).
        page_title: Title passed to ``wxr.wtp.start_page``.
        page_text: Page text to parse.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dictionary per collected
        entry.  An entry without senses first receives a single sense tagged
        ``"no-gloss"``.
    """
    # page layout
    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
    # https://it.wiktionary.org/wiki/Aiuto:Come_iniziare_una_pagina
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    # Level-2 titles that are skipped entirely.
    ignored_titles = ["Altri progetti", "Note / Riferimenti"]

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_cats = {}
        lang_name = clean_node(wxr, lang_cats, lang_node.largs)
        if lang_name in ignored_titles:
            continue

        # Only the first template in the node's content is consulted.
        lang_code = next(
            (
                template.template_name.strip("-")
                for template in lang_node.find_content(NodeKind.TEMPLATE)
            ),
            "unknown",
        )

        # A value of None means no language-code filter is applied.
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_cats.get("categories", []),
        )
        for sub_level in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, sub_level)

    # First pass: give sense-less entries a placeholder sense.
    for entry in page_data:
        if len(entry.senses) > 0:
            continue
        entry.senses.append(Sense(tags=["no-gloss"]))

    # Second pass: serialise every entry, omitting default-valued fields.
    dumped: list[dict[str, Any]] = []
    for entry in page_data:
        dumped.append(entry.model_dump(exclude_defaults=True))
    return dumped