Coverage for src/wiktextract/extractor/ru/inflection.py: 98%

131 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1from dataclasses import dataclass 

2from itertools import chain 

3 

4from wikitextprocessor import HTMLNode, NodeKind, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

@dataclass
class TableHeader:
    """Position and extent of one table cell on the table grid.

    Stores the cell text together with the column/row where the cell starts
    and how many columns/rows it spans.  The field order matters: callers
    build instances positionally as
    ``TableHeader(text, col_index, colspan, row_index, rowspan)``.
    """

    # Text of the cell.
    text: str
    # Index of the first column the cell occupies.
    col_index: int = 0
    # Number of columns the cell covers.
    colspan: int = 1
    # Index of the first row the cell occupies.
    row_index: int = 0
    # Number of rows the cell covers.
    rowspan: int = 1

19 

20 

21# Викисловарь:Шаблоны словоизменений 

22 

23 

def _html_cell_span(cell_tag: HTMLNode, attr_name: str) -> int:
    """Read the ``colspan``/``rowspan`` attribute *attr_name* of *cell_tag*.

    A missing attribute counts as 1.  A value that ``int()`` rejects (for
    example an empty string or ``"2px"``) also counts as 1, so one malformed
    attribute does not abort parsing of the whole table.
    """
    try:
        return int(cell_tag.attrs.get(attr_name, "1"))
    except (TypeError, ValueError):
        return 1


def parse_html_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_tag: HTMLNode
) -> None:
    """Append the forms found in an HTML inflection table to *word_entry*.

    The table is walked twice.  The first pass records every ``th`` cell as
    a ``TableHeader``: a column header when its row contains no ``td``, a
    row header otherwise.  The second pass visits every ``td``, collects the
    texts of the headers overlapping it as raw tags, splits the cell text on
    line breaks and commas, and appends one ``Form`` per remaining word to
    ``word_entry.forms`` after calling ``translate_raw_tags`` on it.

    Args:
        wxr: Context handed to ``clean_node``; ``wxr.wtp.title`` is the
            string that is skipped when it appears as a form.
        word_entry: Entry whose ``forms`` list is extended in place.
        table_tag: Table node; its ``tr`` nodes are obtained through
            ``find_html``.
    """
    # HTML table
    # https://ru.wiktionary.org/wiki/Шаблон:прил
    # Шаблон:спряжения
    col_headers = []
    row_headers = []
    # Pass 1: place every header cell on the table grid.
    for row_index, tr_tag in enumerate(table_tag.find_html("tr")):
        row_has_data = any(tr_tag.find_html("td"))
        col_index = 0
        # Begin to the right of headers from earlier rows whose rowspan
        # still covers this row.
        for header in chain(col_headers, row_headers):
            if (
                row_index > header.row_index
                and row_index < header.row_index + header.rowspan
                and header.col_index <= col_index
            ):
                col_index += header.colspan
        for th_tag in tr_tag.find_html("th"):
            th_text = clean_node(wxr, None, th_tag)
            colspan = _html_cell_span(th_tag, "colspan")
            rowspan = _html_cell_span(th_tag, "rowspan")
            header = TableHeader(
                th_text, col_index, colspan, row_index, rowspan
            )
            # A row made only of header cells holds column headers; header
            # cells sharing a row with data cells are row headers.
            if not row_has_data:
                col_headers.append(header)
            else:
                row_headers.append(header)
            col_index += colspan

    # Data cells with rowspan > 1, remembered so later rows can skip the
    # columns they occupy.
    has_rowspan_td = []
    # Pass 2: turn every data cell into forms tagged with its headers.
    for row_index, tr_tag in enumerate(table_tag.find_html("tr")):
        col_index = 0
        # Row of the most recently recorded column header that starts above
        # this row; stays 0 when there is none.
        last_col_header_row = 0
        for col_header in reversed(col_headers):
            if col_header.row_index < row_index:
                last_col_header_row = col_header.row_index
                break
        # Skip the columns taken by row headers covering this row.
        for row_header in row_headers:
            if (
                row_index >= row_header.row_index
                and row_index < row_header.row_index + row_header.rowspan
                and row_header.col_index <= col_index
            ):
                col_index += row_header.colspan
        for td_tag in tr_tag.find_html("td"):
            # Skip the columns taken by data cells spanning down from above.
            # NOTE(review): this check runs again for every td of the row,
            # so one spanning cell may shift col_index more than once -
            # TODO confirm this is intended.
            for above_td in has_rowspan_td:
                if (
                    row_index > above_td.row_index
                    and row_index < above_td.row_index + above_td.rowspan
                    and above_td.col_index <= col_index
                ):
                    col_index += above_td.colspan
            colspan = _html_cell_span(td_tag, "colspan")
            rowspan = _html_cell_span(td_tag, "rowspan")
            if rowspan > 1:
                has_rowspan_td.append(
                    TableHeader("", col_index, colspan, row_index, rowspan)
                )
            td_text = clean_node(wxr, None, td_tag)
            raw_tags = []
            use_col_tags = []
            for col_header in reversed(col_headers):
                if (
                    col_header.col_index < col_index + colspan
                    and col_index < col_header.col_index + col_header.colspan
                    and col_header.text not in raw_tags
                    and col_header.text not in use_col_tags
                    # column header above cell and above last header
                    # don't use headers for other top sections
                    and col_header.row_index + col_header.rowspan
                    in (last_col_header_row, last_col_header_row + 1)
                ):
                    use_col_tags.append(col_header.text)
            # Headers were gathered in reverse list order; flip them back.
            raw_tags.extend(reversed(use_col_tags))
            for row_header in row_headers:
                if (
                    row_header.row_index < row_index + rowspan
                    and row_index < row_header.row_index + row_header.rowspan
                    and row_header.text not in raw_tags
                ):
                    raw_tags.append(row_header.text)
            for line in td_text.splitlines():
                for word in line.split(","):
                    word = word.strip()
                    # Drop empty pieces, the dash placeholder and the page
                    # title itself.
                    if word not in ("", "—", wxr.wtp.title):
                        form = Form(form=word, raw_tags=raw_tags)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
            col_index += colspan

116 

117 

def _wikitext_cell_span(cell_node: WikiNode, attr_name: str) -> int:
    """Read the ``colspan``/``rowspan`` attribute *attr_name* of *cell_node*.

    A missing attribute counts as 1.  A value that ``int()`` rejects (for
    example an empty string or ``"2px"``) also counts as 1, so one malformed
    attribute does not abort parsing of the whole table.
    """
    try:
        return int(cell_node.attrs.get(attr_name, "1"))
    except (TypeError, ValueError):
        return 1


def parse_wikitext_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table: WikiNode
) -> None:
    """Append the forms found in a wikitext inflection table to *word_entry*.

    The table is walked twice.  The first pass records every header cell as
    a ``TableHeader``: a column header when its row holds no data cell, a
    row header otherwise.  The second pass visits every data cell and, for
    each line of its text that is not a placeholder, builds a ``Form`` whose
    ``raw_tags`` are the texts of the overlapping column headers followed by
    the overlapping row headers, calls ``translate_raw_tags`` on it and
    appends it to ``word_entry.forms``.

    Args:
        wxr: Context handed to ``clean_node``; ``wxr.wtp.title`` is the
            string that is skipped when it appears as a form.
        word_entry: Entry whose ``forms`` list is extended in place.
        table: Table node whose ``NodeKind.TABLE_ROW`` children are read.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:сущ-ru
    # Шаблон:inflection сущ ru
    col_headers = []
    row_headers = []
    # Pass 1: place every header cell on the table grid.
    for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        col_index = 0
        # Begin to the right of headers from earlier rows whose rowspan
        # still covers this row.
        for header in chain(col_headers, row_headers):
            if (
                row_index > header.row_index
                and row_index < header.row_index + header.rowspan
                and header.col_index <= col_index
            ):
                col_index += header.colspan
        for cell_node in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell_node)
            colspan = _wikitext_cell_span(cell_node, "colspan")
            rowspan = _wikitext_cell_span(cell_node, "rowspan")
            # Data cells are not recorded here; they only advance the
            # column counter below.
            if cell_node.kind != NodeKind.TABLE_CELL:
                if not row_has_data:
                    col_headers.append(
                        TableHeader(
                            cell_text, col_index, colspan, row_index, rowspan
                        )
                    )
                else:
                    # For a header reading "М." that contains a link, use
                    # the cleaned first part of the first link's first
                    # argument as the header text instead (presumably the
                    # link target spells the abbreviation out - confirm).
                    if cell_text == "М." and cell_node.contain_node(
                        NodeKind.LINK
                    ):
                        for link_node in cell_node.find_child(NodeKind.LINK):
                            cell_text = clean_node(
                                wxr, None, link_node.largs[0][0]
                            )
                            break
                    row_headers.append(
                        TableHeader(
                            cell_text, col_index, colspan, row_index, rowspan
                        )
                    )
            col_index += colspan

    # Pass 2: turn every data cell into forms tagged with its headers.
    for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
        col_index = 0
        # Skip the columns taken by headers covering this row (including
        # headers that start on this row).
        for header in chain(col_headers, row_headers):
            if (
                row_index >= header.row_index
                and row_index < header.row_index + header.rowspan
                and header.col_index <= col_index
            ):
                col_index += header.colspan
        for cell_node in row.find_child(NodeKind.TABLE_CELL):
            colspan = _wikitext_cell_span(cell_node, "colspan")
            rowspan = _wikitext_cell_span(cell_node, "rowspan")
            cell_text = clean_node(wxr, None, cell_node)
            # NOTE(review): both variables are set once per cell, not once
            # per line, so their values carry over between the lines of a
            # multi-line cell - TODO confirm this is intended.
            last_col_header_row = -1
            use_tags = []
            for line in cell_text.splitlines():
                line = line.strip("\n /")
                # Skip empty lines, dash placeholders and the page title.
                if line not in ("", "—", "-", wxr.wtp.title):
                    form = Form(form=line)
                    for col_header in reversed(col_headers):
                        header_end_row = (
                            col_header.row_index + col_header.rowspan
                        )
                        if last_col_header_row == -1:
                            # No header picked yet: accept any header that
                            # ends at or above this row.
                            row_matches = header_end_row <= row_index
                        else:
                            # Otherwise only accept headers ending at, or
                            # one row below, the row of the last picked one.
                            row_matches = header_end_row in (
                                last_col_header_row,
                                last_col_header_row + 1,
                            )
                        if (
                            col_header.text != ""
                            and col_header.col_index < col_index + colspan
                            and col_index
                            < col_header.col_index + col_header.colspan
                            and col_header.text not in form.raw_tags
                            and col_header.text not in use_tags
                            and row_matches
                        ):
                            use_tags.append(col_header.text)
                            last_col_header_row = col_header.row_index
                    # Headers were gathered in reverse list order; flip
                    # them back before storing.
                    form.raw_tags.extend(reversed(use_tags))
                    use_tags.clear()
                    for row_header in reversed(row_headers):
                        if (
                            row_header.text != ""
                            and row_header.row_index < row_index + rowspan
                            and row_index
                            < row_header.row_index + row_header.rowspan
                            and row_header.text not in form.raw_tags
                            and row_header.text not in use_tags
                        ):
                            use_tags.append(row_header.text)
                    form.raw_tags.extend(reversed(use_tags))
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
            col_index += colspan

224 

225 

def extract_прил_ru_comparative_forms(
    wxr: WiktextractContext, word_entry: WordEntry, expanded_node: WikiNode
) -> None:
    """Add the linked comparative forms of an adjective to *word_entry*.

    Scans the children of *expanded_node* in order.  Once a string child
    whose cleaned text ends with "Сравнительная степень —" has been seen,
    every link inside each following italic node is cleaned with
    ``clean_node`` and, when the result is not empty, appended to
    ``word_entry.forms`` as a ``Form`` tagged ``"comparative"``.
    """
    marker_seen = False
    for child in expanded_node.children:
        if isinstance(child, str):
            # Plain text can only switch the marker on, never off.
            if clean_node(wxr, None, child).endswith(
                "Сравнительная степень —"
            ):
                marker_seen = True
            continue
        if not marker_seen:
            continue
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC):
            continue
        for link in child.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link)
            if link_text != "":
                comparative = Form(form=link_text, tags=["comparative"])
                word_entry.forms.append(comparative)

245 )