Coverage for src/wiktextract/extractor/ru/inflection.py: 98%

138 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-07 08:08 +0000

1from dataclasses import dataclass 

2from itertools import chain 

3 

4from wikitextprocessor import HTMLNode, NodeKind, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

@dataclass
class TableHeader:
    """Text and grid placement of one table cell.

    The table parsers in this module use it for header cells and, with an
    empty ``text``, for data cells that span several rows.
    """

    # Cleaned text of the cell ("" for spanning data cells).
    text: str
    # Zero-based grid column where the cell starts.
    col_index: int = 0
    # Number of grid columns the cell covers.
    colspan: int = 1
    # Zero-based index of the table row where the cell starts.
    row_index: int = 0
    # Number of table rows the cell covers.
    rowspan: int = 1

19 

20 

21# Викисловарь:Шаблоны словоизменений 

22 

23 

def parse_html_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_tag: HTMLNode
):
    """Append the forms found in an HTML ``<table>`` to ``word_entry.forms``.

    Template references kept from the original notes:
    https://ru.wiktionary.org/wiki/Шаблон:прил and ``Шаблон:спряжения``.

    The table is walked twice.  The first pass records every ``<th>`` as a
    ``TableHeader``: headers in a row without any ``<td>`` are column
    headers, all other headers are row headers.  The second pass visits
    every ``<td>``, collects the texts of the headers that overlap it as raw
    tags, and creates one ``Form`` per comma- or newline-separated word.
    Empty strings, "—" and the page title are not emitted as forms.
    """
    col_headers = []
    row_headers = []

    # Pass 1: locate all header cells on the table grid.
    for row_index, tr_tag in enumerate(table_tag.find_html("tr")):
        row_has_data = any(tr_tag.find_html("td"))
        col_index = 0
        # Step over columns taken by headers of earlier rows whose rowspan
        # reaches into this row.
        for spanning in chain(col_headers, row_headers):
            if (
                spanning.row_index
                < row_index
                < spanning.row_index + spanning.rowspan
                and spanning.col_index <= col_index
            ):
                col_index += spanning.colspan
        # Headers sharing a row with data cells label that row; headers in a
        # header-only row label columns.
        target_headers = row_headers if row_has_data else col_headers
        for th_tag in tr_tag.find_html("th"):
            th_text = clean_node(wxr, None, th_tag)
            colspan = int(th_tag.attrs.get("colspan", "1"))
            rowspan = int(th_tag.attrs.get("rowspan", "1"))
            target_headers.append(
                TableHeader(th_text, col_index, colspan, row_index, rowspan)
            )
            col_index += colspan

    # Pass 2: turn data cells into forms.
    # Data cells with rowspan > 1, remembered so later rows can skip them.
    spanning_cells = []
    for row_index, tr_tag in enumerate(table_tag.find_html("tr")):
        # Row of the most recently recorded column header that starts above
        # this row; 0 when there is none.
        last_col_header_row = next(
            (
                header.row_index
                for header in reversed(col_headers)
                if header.row_index < row_index
            ),
            0,
        )
        col_index = 0
        # Data cells start to the right of the row headers covering this row.
        for row_header in row_headers:
            if (
                row_header.row_index
                <= row_index
                < row_header.row_index + row_header.rowspan
                and row_header.col_index <= col_index
            ):
                col_index += row_header.colspan

        for td_tag in tr_tag.find_html("td"):
            # Step over data cells from earlier rows that span into this row.
            for earlier_td in spanning_cells:
                if (
                    earlier_td.row_index
                    < row_index
                    < earlier_td.row_index + earlier_td.rowspan
                    and earlier_td.col_index <= col_index
                ):
                    col_index += earlier_td.colspan
            colspan = int(td_tag.attrs.get("colspan", "1"))
            rowspan = int(td_tag.attrs.get("rowspan", "1"))
            if rowspan > 1:
                spanning_cells.append(
                    TableHeader("", col_index, colspan, row_index, rowspan)
                )

            # Column headers are scanned bottom-up and stored top-down.
            col_tags = []
            for col_header in reversed(col_headers):
                overlaps_columns = (
                    col_header.col_index < col_index + colspan
                    and col_index < col_header.col_index + col_header.colspan
                )
                header_end_row = col_header.row_index + col_header.rowspan
                # Only headers that end at or directly below the last header
                # row are used, so headers of other top sections are ignored.
                if (
                    overlaps_columns
                    and col_header.text not in col_tags
                    and header_end_row
                    in (last_col_header_row, last_col_header_row + 1)
                ):
                    col_tags.append(col_header.text)
            raw_tags = col_tags[::-1]

            for row_header in row_headers:
                if (
                    row_header.row_index < row_index + rowspan
                    and row_index < row_header.row_index + row_header.rowspan
                    and row_header.text not in raw_tags
                ):
                    raw_tags.append(row_header.text)

            # <span style="...cursor:help..."> children contribute their
            # "title" attribute as a tag; every other child is form text.
            form_nodes = []
            for td_child in td_tag.children:
                is_help_span = (
                    isinstance(td_child, HTMLNode)
                    and td_child.tag == "span"
                    and "cursor:help" in td_child.attrs.get("style", "")
                )
                if not is_help_span:
                    form_nodes.append(td_child)
                    continue
                help_title = td_child.attrs.get("title", "")
                if help_title != "":
                    raw_tags.append(help_title)

            for line in clean_node(wxr, None, form_nodes).splitlines():
                for piece in line.split(","):
                    word = piece.strip()
                    if word in ["", "—", wxr.wtp.title]:
                        continue
                    form = Form(form=word, raw_tags=raw_tags)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
            col_index += colspan

128 

129 

def parse_wikitext_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table: WikiNode
) -> None:
    """Append the forms found in a wikitext table to ``word_entry.forms``.

    Template references kept from the original notes:
    https://ru.wiktionary.org/wiki/Шаблон:сущ-ru and
    ``Шаблон:inflection сущ ru``.

    The table is walked twice.  The first pass records every header cell as
    a ``TableHeader``: headers in a row without data cells are column
    headers, the others are row headers.  The second pass visits every data
    cell and creates one ``Form`` per non-empty line of its text, tagged
    with the texts of the overlapping column headers (top to bottom)
    followed by the overlapping row headers.  Lines equal to "", "—", "-"
    or the page title are skipped.
    """
    col_headers = []
    row_headers = []

    # Pass 1: locate all header cells on the table grid.
    for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        col_index = 0
        # Step over columns taken by headers of earlier rows whose rowspan
        # reaches into this row.
        for header in chain(col_headers, row_headers):
            if (
                header.row_index < row_index < header.row_index + header.rowspan
                and header.col_index <= col_index
            ):
                col_index += header.colspan
        for cell_node in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell_node)
            colspan = int(cell_node.attrs.get("colspan", "1"))
            rowspan = int(cell_node.attrs.get("rowspan", "1"))
            if cell_node.kind == NodeKind.TABLE_CELL:
                # Data cells only advance the column counter in this pass.
                pass
            elif not row_has_data:
                col_headers.append(
                    TableHeader(
                        cell_text, col_index, colspan, row_index, rowspan
                    )
                )
            else:
                if cell_text == "М." and cell_node.contain_node(NodeKind.LINK):
                    # Replace the abbreviated "М." header by the first item
                    # of the first link's first argument (presumably the
                    # link target - TODO confirm).
                    for link_node in cell_node.find_child(NodeKind.LINK):
                        cell_text = clean_node(wxr, None, link_node.largs[0][0])
                        break
                row_headers.append(
                    TableHeader(
                        cell_text, col_index, colspan, row_index, rowspan
                    )
                )
            col_index += colspan

    # Pass 2: turn data cells into forms.
    for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
        col_index = 0
        for header in chain(col_headers, row_headers):
            if (
                header.row_index
                <= row_index
                < header.row_index + header.rowspan
                and header.col_index <= col_index
            ):
                col_index += header.colspan
        for cell_node in row.find_child(NodeKind.TABLE_CELL):
            colspan = int(cell_node.attrs.get("colspan", "1"))
            rowspan = int(cell_node.attrs.get("rowspan", "1"))
            cell_text = clean_node(wxr, None, cell_node)
            for line in cell_text.splitlines():
                line = line.strip("\n /")
                if line in ["", "—", "-", wxr.wtp.title]:
                    continue
                form = Form(form=line)
                # Header matching state is reset for every form.  Sharing it
                # across the lines of one cell made later forms start from
                # the previous form's last header row and leftover tags, so
                # they could lose column headers of multi-row header blocks.
                last_col_header_row = -1
                use_tags = []
                # Column headers are scanned bottom-up.
                for col_header in col_headers[::-1]:
                    header_end_row = col_header.row_index + col_header.rowspan
                    if last_col_header_row == -1:
                        # First match: any header that ends above the cell.
                        row_matches = header_end_row <= row_index
                    else:
                        # Later matches must end at or directly below the
                        # previously matched header row.
                        row_matches = header_end_row in [
                            last_col_header_row,
                            last_col_header_row + 1,
                        ]
                    if (
                        col_header.text != ""
                        and col_header.col_index < col_index + colspan
                        and col_index
                        < col_header.col_index + col_header.colspan
                        and col_header.text not in form.raw_tags
                        and col_header.text not in use_tags
                        and row_matches
                    ):
                        use_tags.append(col_header.text)
                        last_col_header_row = col_header.row_index
                # Store the column tags top-down.
                form.raw_tags.extend(use_tags[::-1])
                use_tags.clear()
                for row_header in row_headers[::-1]:
                    if (
                        row_header.text != ""
                        and row_header.row_index < row_index + rowspan
                        and row_index
                        < row_header.row_index + row_header.rowspan
                        and row_header.text not in form.raw_tags
                        and row_header.text not in use_tags
                    ):
                        use_tags.append(row_header.text)
                form.raw_tags.extend(use_tags[::-1])
                translate_raw_tags(form)
                word_entry.forms.append(form)
            col_index += colspan

236 

237 

def extract_прил_ru_comparative_forms(
    wxr: WiktextractContext, word_entry: WordEntry, expanded_node: WikiNode
) -> None:
    """Append comparative forms listed after "Сравнительная степень —".

    The direct children of ``expanded_node`` are scanned in order.  Once a
    string child whose cleaned text ends with "Сравнительная степень —" has
    been seen, every link that is a direct child of a following italic node
    is added to ``word_entry.forms`` as a ``Form`` tagged "comparative",
    unless its cleaned text is empty.
    """
    marker_seen = False
    for child in expanded_node.children:
        if isinstance(child, str):
            child_text = clean_node(wxr, None, child)
            if child_text.endswith("Сравнительная степень —"):
                marker_seen = True
            continue
        if not (
            marker_seen
            and isinstance(child, WikiNode)
            and child.kind == NodeKind.ITALIC
        ):
            continue
        for link_node in child.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text != "":
                word_entry.forms.append(
                    Form(form=link_text, tags=["comparative"])
                )