Coverage for src / wiktextract / extractor / cs / page.py: 85%

68 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .declension import extract_declension_section 

10from .etymology import extract_etymology_section 

11from .linkage import extract_alt_form_section, extract_linkage_section 

12from .models import Sense, WordEntry 

13from .pos import ( 

14 extract_note_section, 

15 extract_pos_section, 

16 extract_sense_section, 

17) 

18from .section_titles import LINKAGE_SECTIONS, POS_DATA 

19from .sound import ( 

20 extract_homophone_section, 

21 extract_hyphenation_section, 

22 extract_sound_section, 

23 extract_transcript_section, 

24) 

25from .translation import extract_translation_section 

26 

27 

def _dispatch_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    subtitle: str,
) -> None:
    """Route one section to the extractor that matches its heading.

    The checks run in a fixed order and the first match wins.  Headings that
    match nothing are reported through ``wxr.wtp.debug`` unless they are one
    of the ignored titles at the bottom.
    """
    has_entries = len(page_data) > 0
    # Most extractors attach to the latest word entry, falling back to the
    # shared base data when no entry exists yet.
    # NOTE(review): `page_data` is shared across every language section of the
    # page, so `latest` may belong to a previous language; only the
    # variant-spelling branch below compares languages -- confirm intended.
    latest = page_data[-1] if has_entries else base_data

    if subtitle in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, subtitle)
        return

    if subtitle == "význam" and has_entries:
        extract_sense_section(wxr, latest, level_node)
        return

    if subtitle == "výslovnost":
        extract_sound_section(wxr, base_data, level_node)
        return

    if subtitle == "dělení":
        extract_hyphenation_section(wxr, base_data, level_node)
        return

    if subtitle == "etymologie":
        # A level-3 etymology always goes to the base data; deeper ones go to
        # the latest entry when there is one.
        etymology_target = (
            latest if level_node.kind != NodeKind.LEVEL3 else base_data
        )
        extract_etymology_section(wxr, etymology_target, level_node)
        return

    if subtitle in ("varianty", "varianta zápisu", "varianty zápisu"):
        belongs_to_latest = (
            level_node.kind == NodeKind.LEVEL4
            and has_entries
            and base_data.lang == page_data[-1].lang
        )
        alt_form_target = page_data[-1] if belongs_to_latest else base_data
        extract_alt_form_section(wxr, alt_form_target, level_node)
        return

    if subtitle == "překlady":
        extract_translation_section(wxr, latest, level_node)
        return

    if subtitle in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, latest, level_node, LINKAGE_SECTIONS[subtitle]
        )
        return

    if subtitle in ("stupňování", "časování") or subtitle.startswith(
        "skloňování"
    ):
        extract_declension_section(wxr, latest, level_node, subtitle)
        return

    if subtitle == "homofony":
        extract_homophone_section(wxr, base_data, level_node)
        return

    if subtitle == "přepis":
        extract_transcript_section(wxr, latest, level_node)
        return

    if subtitle == "poznámka k užití":
        extract_note_section(wxr, latest, level_node)
        return

    if subtitle not in ("externí odkazy", "poznámky", "reference"):
        wxr.wtp.debug(f"Unknown title: {subtitle}", sortid="cs/page/27")


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
):
    """Process one heading node and, recursively, its nested headings.

    The heading text is normalised, handed to the matching extractor, and
    then every child heading is processed the same way.  Finally each link
    node that is a direct child of this section is passed to ``clean_node``
    together with the latest entry (or ``base_data`` if none exists).
    """
    heading_text = clean_node(wxr, None, level_node.largs)
    # Remove "(<digits>)" markers, then keep only the text before the first
    # "/" if the heading has one.
    subtitle = re.sub(r"\(\d+\)", "", heading_text).partition("/")[0].strip()

    _dispatch_section(wxr, page_data, base_data, level_node, subtitle)

    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)

    # Re-evaluated here because the calls above may have appended entries.
    # NOTE(review): presumably this lets clean_node record category links on
    # the entry -- confirm against clean_node.
    for link_node in level_node.find_child(NodeKind.LINK):
        link_target = page_data[-1] if page_data else base_data
        clean_node(wxr, link_target, link_node)

104 

105 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dictionaries.

    Each level-2 heading is treated as a language section.  Sections titled
    "poznámky" or "externí odkazy" are skipped, as are languages excluded by
    ``wxr.config.capture_language_codes`` when that setting is not ``None``.
    Entries that end up without any sense receive a single sense tagged
    "no-gloss".  The dictionaries come from ``model_dump`` with
    ``exclude_defaults=True``.
    """
    # page layout
    # https://cs.wiktionary.org/wiki/Wikislovník:Formát_hesla
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries: list[WordEntry] = []

    for lang_section in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_section.largs) or "unknown"
        if lang_name in ("poznámky", "externí odkazy"):
            continue

        lang_code = name_to_code(lang_name, "cs") or "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        # Template entry carrying the values shared by this language section.
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for child_section in lang_section.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, child_section)

    dumped = []
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
        dumped.append(entry.model_dump(exclude_defaults=True))
    return dumped