Coverage for src/wiktextract/extractor/cs/page.py: 86%

66 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .declension import extract_declension_section 

10from .etymology import extract_etymology_section 

11from .linkage import extract_alt_form_section, extract_linkage_section 

12from .models import Sense, WordEntry 

13from .pos import ( 

14 extract_note_section, 

15 extract_pos_section, 

16 extract_sense_section, 

17) 

18from .section_titles import LINKAGE_SECTIONS, POS_DATA 

19from .sound import ( 

20 extract_homophone_section, 

21 extract_hyphenation_section, 

22 extract_sound_section, 

23 extract_transcript_section, 

24) 

25from .translation import extract_translation_section 

26 

27 

def _has_current_lang_entry(
    page_data: list[WordEntry], base_data: WordEntry
) -> bool:
    """Tell whether the newest entry in `page_data` has `base_data`'s language.

    `page_data` accumulates entries for every language section of a page, so
    a non-empty list alone does not mean the last entry belongs to the
    language section that is currently being parsed.
    """
    return len(page_data) > 0 and page_data[-1].lang == base_data.lang


def _target_entry(
    page_data: list[WordEntry], base_data: WordEntry
) -> WordEntry:
    """Pick the entry that data of the current language should be stored in.

    Returns the newest entry when it has the same language as `base_data`,
    otherwise `base_data` itself.
    """
    if _has_current_lang_entry(page_data, base_data):
        return page_data[-1]
    return base_data


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
):
    """Dispatch one heading node to its extractor, then recurse into it.

    The cleaned heading text selects the extractor.  Pronunciation,
    hyphenation and homophone sections are always stored in `base_data`;
    most other sections are stored in the newest entry of `page_data` when
    that entry has the same language as `base_data`, and in `base_data`
    otherwise.  Child heading nodes are parsed recursively, and every direct
    link child of the node is passed through `clean_node` with the target
    entry.

    Args:
        wxr: Extraction context; also used to emit debug messages.
        page_data: Entries collected so far for the whole page.
        base_data: Entry holding the data of the current language section.
        level_node: Heading node to process.
    """
    subtitle = clean_node(wxr, None, level_node.largs)
    # Drop parenthesised numbering such as "(1)" before matching the title.
    subtitle = re.sub(r"\(\d+\)", "", subtitle).strip()
    # Evaluated before dispatch: only the POS branch receives `page_data`
    # itself, and that branch does not use these two values.
    has_entry = _has_current_lang_entry(page_data, base_data)
    target = _target_entry(page_data, base_data)

    if subtitle in POS_DATA and level_node.contain_node(LEVEL_KIND_FLAGS):
        extract_pos_section(wxr, page_data, base_data, level_node, subtitle)
    elif subtitle == "význam" and has_entry:
        # Senses are only added to an entry of the current language; without
        # one the title falls through to the "Unknown title" debug message.
        extract_sense_section(wxr, page_data[-1], level_node)
    elif subtitle == "výslovnost":
        extract_sound_section(wxr, base_data, level_node)
    elif subtitle == "dělení":
        extract_hyphenation_section(wxr, base_data, level_node)
    elif subtitle == "etymologie":
        # A level-3 etymology section is always stored in `base_data`.
        extract_etymology_section(
            wxr,
            base_data if level_node.kind == NodeKind.LEVEL3 else target,
            level_node,
        )
    elif subtitle in ["varianty", "varianta zápisu", "varianty zápisu"]:
        # Only a level-4 section is stored in the newest entry.
        extract_alt_form_section(
            wxr,
            target if level_node.kind == NodeKind.LEVEL4 else base_data,
            level_node,
        )
    elif subtitle == "překlady":
        extract_translation_section(wxr, target, level_node)
    elif subtitle in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, target, level_node, LINKAGE_SECTIONS[subtitle]
        )
    elif subtitle in ["stupňování", "časování"] or subtitle.startswith(
        "skloňování"
    ):
        extract_declension_section(wxr, target, level_node, subtitle)
    elif subtitle == "homofony":
        extract_homophone_section(wxr, base_data, level_node)
    elif subtitle == "přepis":
        extract_transcript_section(wxr, target, level_node)
    elif subtitle == "poznámka k užití":
        extract_note_section(wxr, target, level_node)
    elif subtitle not in ["externí odkazy", "poznámky", "reference"]:
        wxr.wtp.debug(f"Unknown title: {subtitle}", sortid="cs/page/27")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    for link_node in level_node.find_child(NodeKind.LINK):
        # The target is looked up again for every link because the calls
        # above may have appended new entries to `page_data`.
        clean_node(wxr, _target_entry(page_data, base_data), link_node)

102 

103 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Czech Wiktionary page into a list of entry dictionaries.

    Every level-2 heading is treated as a language section: its cleaned
    title is the language name and `name_to_code` supplies the language
    code, with "unknown" as the fallback for both.  Sections titled
    "poznámky" or "externí odkazy" are skipped, as are languages excluded by
    `wxr.config.capture_language_codes`.  Entries that end up without any
    sense receive a single sense tagged "no-gloss".

    Args:
        wxr: Extraction context.
        page_title: Title of the page being parsed.
        page_text: Wikitext of the page.

    Returns:
        One `model_dump(exclude_defaults=True)` dictionary per entry.
    """
    # Page layout reference:
    # https://cs.wiktionary.org/wiki/Wikislovník:Formát_hesla
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries = []
    skipped_titles = ["poznámky", "externí odkazy"]

    for lang_node in root.find_child(NodeKind.LEVEL2):
        language = clean_node(wxr, None, lang_node.largs) or "unknown"
        if language in skipped_titles:
            continue

        code = name_to_code(language, "cs") or "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and code not in wanted_codes:
            continue

        wxr.wtp.start_section(language)
        # Data gathered for the language as a whole is collected here.
        language_entry = WordEntry(
            word=wxr.wtp.title,
            lang_code=code,
            lang=language,
            pos="unknown",
        )
        for child_section in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, language_entry, child_section)

    dumped = []
    for entry in entries:
        if not entry.senses:
            # Keep sense-less entries recognisable in the output.
            entry.senses.append(Sense(tags=["no-gloss"]))
        dumped.append(entry.model_dump(exclude_defaults=True))
    return dumped