Coverage for src / wiktextract / extractor / cs / declension.py: 95%

87 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import capture_text_in_parentheses 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

# Tags implied by the title of a declension section.  Titles that are not
# listed here contribute no tags (callers look the title up with an empty
# list as the default).
DECLENSION_SECTION_TAGS: dict[str, list[str]] = {
    "skloňování mužské": ["masculine"],
    "skloňování ženské": ["feminine"],
}

15 

16 

def extract_declension_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    section_title: str,
):
    """Extract inflected forms from the templates of a declension section.

    The tags implied by ``section_title`` are looked up in
    ``DECLENSION_SECTION_TAGS`` (no tags when the title is not listed) and
    handed to whichever extractor handles each template found directly under
    ``level_node``.  Templates with any other name are ignored.
    """
    table_template_prefixes = (
        "Substantivum ",
        "Adjektivum ",
        "Stupňování ",
        "Sloveso ",
    )
    tags_for_section = DECLENSION_SECTION_TAGS.get(section_title, [])
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        # The exact name must be tested first: "Sloveso (ja)" also begins
        # with the generic "Sloveso " prefix.
        if name == "Sloveso (ja)":
            handler = extract_sloveso_ja_template
        elif name.startswith(table_template_prefixes):
            handler = extract_substantivum_template
        else:
            continue
        handler(wxr, word_entry, template, tags_for_section)

31 

32 

@dataclass
class TableHeader:
    """A header cell of an inflection table and the grid area it covers.

    ``col_index`` and ``row_index`` give the column and row at which the
    cell starts; ``colspan`` and ``rowspan`` give how many columns and rows
    it extends over from there.
    """

    # Header text of the cell.
    text: str
    # Number of columns covered by the cell.
    colspan: int
    # Number of rows covered by the cell.
    rowspan: int
    # Column at which the cell starts.
    col_index: int
    # Row at which the cell starts.
    row_index: int

40 

41 

def _parse_span(cell, attr_name: str) -> int:
    """Read a table cell's ``colspan``/``rowspan`` attribute as an integer.

    Returns ``1`` when the attribute is absent or when ``int()`` rejects its
    value, so a single malformed attribute cannot abort the extraction of
    the whole table.
    """
    try:
        return int(cell.attrs.get(attr_name, "1"))
    except ValueError:
        return 1


def extract_substantivum_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    section_tags: list[str],
):
    """Extract forms from the table(s) produced by an inflection template.

    The template is expanded and re-parsed; every data cell of every table
    found directly under the expanded node yields one ``Form`` (two when the
    cell text contains exactly one ``/``).  Each form receives
    ``section_tags`` as ``tags`` and, as ``raw_tags``, the tags returned by
    ``capture_text_in_parentheses``, the table caption, and the texts of the
    row and column headers whose span overlaps the cell.  Words that are
    empty, ``"—"`` or equal to ``wxr.wtp.title`` are skipped.  The forms are
    appended to ``word_entry.forms``.

    ``colspan``/``rowspan`` attributes that cannot be parsed as integers are
    treated as ``1``.
    """
    # https://cs.wiktionary.org/wiki/Šablona:Substantivum_(cs)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # The return value is discarded; the call is made for whatever
    # clean_node records on word_entry (presumably categories -- confirm
    # against clean_node).
    clean_node(wxr, word_entry, expanded_node)
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        row_headers = []
        table_caption = ""
        # When a table has several captions the last one wins.
        for caption_node in table.find_child(NodeKind.TABLE_CAPTION):
            table_caption = clean_node(wxr, None, caption_node.children)
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            # A row for which contain_node reports no data cell is treated
            # as a row of column headers.
            is_column_header = not row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # Headers from earlier rows that span down into this row occupy
            # its leading columns; start counting after them.
            for header in col_headers if is_column_header else row_headers:
                if (
                    header.rowspan > 1
                    and header.row_index <= row_index
                    and header.row_index + header.rowspan > row_index
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                colspan = _parse_span(cell, "colspan")
                rowspan = _parse_span(cell, "rowspan")
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header = TableHeader(
                        cell_text,
                        colspan,
                        rowspan,
                        col_index,
                        row_index,
                    )
                    if is_column_header:
                        col_headers.append(header)
                    else:
                        row_headers.append(header)
                else:
                    # Exactly one "/" separates two alternative forms; any
                    # other count leaves the cell text as a single word.
                    words = (
                        filter(None, map(str.strip, cell_text.split("/")))
                        if cell_text.count("/") == 1
                        else [cell_text]
                    )
                    for word in words:
                        cell_tags, word = capture_text_in_parentheses(word)
                        word = word.strip()
                        if word in ["", "—", wxr.wtp.title]:
                            continue
                        form = Form(
                            form=word, tags=section_tags, raw_tags=cell_tags
                        )
                        if table_caption != "":
                            form.raw_tags.append(table_caption)
                        # Row headers whose row range overlaps the rows
                        # covered by this cell.
                        for row_header in row_headers:
                            if (
                                row_header.text != ""
                                and row_header.row_index < row_index + rowspan
                                and row_header.row_index + row_header.rowspan
                                > row_index
                            ):
                                form.raw_tags.append(row_header.text)
                        # Column headers whose column range overlaps the
                        # columns covered by this cell.
                        for col_header in col_headers:
                            if (
                                col_header.text != ""
                                and col_header.col_index < col_index + colspan
                                and col_header.col_index + col_header.colspan
                                > col_index
                            ):
                                form.raw_tags.append(col_header.text)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                # Header and data cells alike advance the column counter.
                col_index += colspan

132 

133 

def extract_sloveso_ja_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    section_tags: list[str],
):
    """Extract forms from the table(s) of the "Sloveso (ja)" template.

    The template is expanded and re-parsed.  Only the first three cells of
    each table row are inspected: a header cell supplies the row's raw tag,
    the data cell at index 1 supplies the form, and the data cell at index 2
    supplies the romanization of that same row's form.  Words that are empty
    or equal to ``wxr.wtp.title`` are skipped, and so is their romanization.
    ``section_tags`` are stored as the form's ``tags``.  All collected forms
    are appended to ``word_entry.forms``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    forms = []
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            # Form created from this row, if any.  The romanization must be
            # attached to it and never to a form left over from an earlier
            # row whose successor was skipped.
            row_form = None
            for col_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL | NodeKind.TABLE_HEADER_CELL)
            ):
                if col_index >= 3:
                    break
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header = clean_node(wxr, None, cell)
                elif cell.kind == NodeKind.TABLE_CELL:
                    if col_index == 1:
                        word = clean_node(wxr, None, cell)
                        if word not in ["", wxr.wtp.title]:
                            # section_tags hold already-normalized tags, so
                            # they belong in ``tags`` rather than
                            # ``raw_tags``.
                            row_form = Form(form=word, tags=section_tags)
                            if row_header != "":
                                row_form.raw_tags.append(row_header)
                            translate_raw_tags(row_form)
                            forms.append(row_form)
                    elif col_index == 2 and row_form is not None:
                        row_form.roman = clean_node(wxr, None, cell)

    word_entry.forms.extend(forms)