Coverage for src/wiktextract/extractor/it/page.py: 80%

65 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from typing import Any 

2 

3from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .etymology import extract_citation_section, extract_etymology_section 

8from .linkage import extract_form_section, extract_linkage_section 

9from .models import Sense, WordEntry 

10from .pos import extract_note_section, extract_pos_section 

11from .section_titles import LINKAGE_SECTIONS, POS_DATA 

12from .sound import extract_hyphenation_section, extract_pronunciation_section 

13from .translation import extract_translation_section 

14 

15 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one heading node to its extractor, then recurse into children.

    The cleaned heading text selects the extractor to run.  Every handled
    branch first registers the heading with ``wxr.wtp.start_subsection`` so
    that messages logged by the extractor are attributed to the subsection
    being processed.

    Args:
        wxr: Extraction context; supplies the wikitext processor (``wtp``).
        page_data: Entries collected so far for the page; passed through to
            every extractor.
        base_data: Per-language template entry; only handed to the POS
            extractor.
        level_node: The heading node to process.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    if title_text in POS_DATA or title_text.startswith("Trascrizione"):
        wxr.wtp.start_subsection(title_text)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
    elif title_text == "Traduzione":
        wxr.wtp.start_subsection(title_text)
        extract_translation_section(wxr, page_data, level_node)
    elif title_text == "Etimologia / Derivazione":
        wxr.wtp.start_subsection(title_text)
        extract_etymology_section(wxr, page_data, level_node)
    elif title_text == "Citazione":
        wxr.wtp.start_subsection(title_text)
        extract_citation_section(wxr, page_data, level_node)
    elif title_text == "Sillabazione":
        wxr.wtp.start_subsection(title_text)
        extract_hyphenation_section(wxr, page_data, level_node)
    elif title_text == "Pronuncia":
        wxr.wtp.start_subsection(title_text)
        extract_pronunciation_section(wxr, page_data, level_node)
    elif title_text in LINKAGE_SECTIONS:
        wxr.wtp.start_subsection(title_text)
        extract_linkage_section(
            wxr, page_data, level_node, LINKAGE_SECTIONS[title_text]
        )
    elif title_text == "Uso / Precisazioni":
        # Register the subsection like every other handled branch, so
        # extractor diagnostics are not blamed on the preceding subsection.
        wxr.wtp.start_subsection(title_text)
        extract_note_section(wxr, page_data, level_node)
    elif title_text in ["Variazione", "Forme flesse", "Variazioni", "Variante"]:
        wxr.wtp.start_subsection(title_text)
        extract_form_section(wxr, page_data, level_node)
    elif title_text not in ["Note / Riferimenti"]:
        # "Note / Riferimenti" is skipped silently; any other unmatched
        # heading is reported through the debug log.
        wxr.wtp.debug(
            f"Unknown section: {title_text}",
            sortid="extractor/it/page/parse_section/49",
        )

    # Nested headings are visited whether or not this heading was recognized.
    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

58 

59 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page's wikitext and return its entries as plain dicts.

    Each level-2 heading is treated as a language section.  Its sub-headings
    are handed to ``parse_section``, which fills the shared entry list.  Any
    entry left without senses receives a placeholder sense tagged
    ``no-gloss``.

    Args:
        wxr: Extraction context (wikitext processor and configuration).
        page_title: Title of the page being parsed.
        page_text: Raw wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dict per collected entry.
    """
    # Page layout references:
    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
    # https://it.wiktionary.org/wiki/Aiuto:Come_iniziare_una_pagina
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []
    non_language_titles = ("Altri progetti", "Note / Riferimenti")

    for lang_node in root.find_child(NodeKind.LEVEL2):
        heading_cats: dict[str, Any] = {}
        lang_name = clean_node(wxr, heading_cats, lang_node.largs)
        if lang_name in non_language_titles:
            continue

        # The language code is the name of the first template in the heading
        # with surrounding "-" characters removed; "unknown" when the heading
        # holds no template.
        first_template = next(
            iter(lang_node.find_content(NodeKind.TEMPLATE)), None
        )
        if first_template is None:
            lang_code = "unknown"
        else:
            lang_code = first_template.template_name.strip("-")

        # A configured language filter of None means "capture everything".
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=heading_cats.get("categories", []),
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]