Coverage for src/wiktextract/extractor/it/page.py: 91%

59 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from typing import Any 

2 

3from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .etymology import extract_citation_section, extract_etymology_section 

8from .linkage import extract_linkage_section 

9from .models import Sense, WordEntry 

10from .pos import extract_pos_section 

11from .section_titles import LINKAGE_SECTIONS, POS_DATA 

12from .sound import extract_hyphenation_section, extract_pronunciation_section 

13from .translation import extract_translation_section 

14 

15 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one heading node to its section extractor, then recurse.

    The heading text is obtained by running ``clean_node`` over
    ``level_node.largs``.  It is matched, in order, against the keys of
    ``POS_DATA``, a fixed set of section titles, and the keys of
    ``LINKAGE_SECTIONS``.  A heading that matches none of them is not
    extracted, but its child level nodes are still visited.

    Args:
        wxr: Extraction context; ``wxr.wtp.start_subsection`` is called with
            the heading text before any extractor runs.
        page_data: Entries collected so far; passed through to the extractors.
        base_data: Passed only to the part-of-speech extractor.
        level_node: The heading node to process.
    """
    heading = clean_node(wxr, None, level_node.largs)

    # Extractors that all share the (wxr, page_data, level_node) call shape.
    simple_extractors = {
        "Traduzione": extract_translation_section,
        "Etimologia / Derivazione": extract_etymology_section,
        "Citazione": extract_citation_section,
        "Sillabazione": extract_hyphenation_section,
        "Pronuncia": extract_pronunciation_section,
    }

    if heading in POS_DATA:
        # POS headings take precedence over every other kind of title.
        wxr.wtp.start_subsection(heading)
        extract_pos_section(wxr, page_data, base_data, level_node, heading)
    elif heading in simple_extractors:
        wxr.wtp.start_subsection(heading)
        simple_extractors[heading](wxr, page_data, level_node)
    elif heading in LINKAGE_SECTIONS:
        wxr.wtp.start_subsection(heading)
        linkage_kind = LINKAGE_SECTIONS[heading]
        extract_linkage_section(wxr, page_data, level_node, linkage_kind)

    # Nested headings are processed regardless of whether this one matched.
    for child_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_node)

49 

50 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Page layout references:
    https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
    https://it.wiktionary.org/wiki/Aiuto:Come_iniziare_una_pagina

    Every level-2 heading of the parsed tree is treated as a language
    section, except the titles "Altri progetti" and "Note / Riferimenti",
    which are skipped.  Child level nodes of each kept section are handed to
    ``parse_section``.

    Args:
        wxr: Extraction context used for parsing and configuration.
        page_title: Title passed to ``wxr.wtp.start_page``.
        page_text: Wikitext passed to ``wxr.wtp.parse`` with
            ``pre_expand=True``.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dict per collected entry.
    """
    skipped_titles = ("Altri progetti", "Note / Riferimenti")

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        heading_cats = {}
        lang_name = clean_node(wxr, heading_cats, lang_node.largs)
        if lang_name in skipped_titles:
            continue

        # The language code comes from the first template in the heading
        # content, with "-" stripped from both ends of its name
        # (presumably templates shaped like "-it-" -- confirm).
        first_template = next(
            iter(lang_node.find_content(NodeKind.TEMPLATE)), None
        )
        if first_template is None:
            lang_code = "unknown"
        else:
            lang_code = first_template.template_name.strip("-")

        # Honour the optional language filter from the configuration.
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue

        wxr.wtp.start_section(lang_name)
        section_base = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=heading_cats.get("categories", []),
        )
        for sub_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, section_base, sub_node)

    # Entries that ended up without any sense get a "no-gloss" placeholder.
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]