Coverage for src/wiktextract/extractor/ru/inflection.py: 95%

122 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from collections import defaultdict 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import HTMLNode, NodeKind, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

@dataclass
class TableHeader:
    """Header cell of an HTML inflection table and the range it covers."""

    # Header text; appended to the raw tags of every form it applies to.
    text: str
    # Index of the first column (for column headers) or of the first row
    # (for row headers) that the header covers.
    start_index: int
    # Number of columns/rows covered, read from the cell's ``colspan`` or
    # ``rowspan`` attribute (1 when the attribute is absent).
    span: int

17 

18 

19# Викисловарь:Шаблоны словоизменений 

20 

21 

def _has_header_bgcolor(node: HTMLNode) -> bool:
    """Tell whether *node* has the ``#EEF9FF`` header background colour.

    The attribute value is compared case-insensitively, matching the check
    that ``parse_wikitext_forms_table`` performs for wikitext tables.
    """
    return node.attrs.get("bgcolor", "").lower() == "#eef9ff"


def parse_html_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_tag: HTMLNode
) -> None:
    """Collect inflected forms from an HTML ``<table>`` into *word_entry*.

    Written for tables such as the one produced by
    https://ru.wiktionary.org/wiki/Шаблон:прил

    The table is walked twice.  The first pass records the headers: a row
    without any ``<td>`` contributes column headers, any other row
    contributes row headers taken from its ``<th>`` cells and from ``<td>``
    cells painted with the header background colour.  The second pass visits
    the remaining ``<td>`` cells, creates one ``Form`` per line of cell text,
    attaches the text of every header whose span covers the cell as a raw
    tag, passes the form through ``translate_raw_tags`` and appends it to
    ``word_entry.forms``.

    Args:
        wxr: Extraction context, forwarded to ``clean_node``.
        word_entry: Entry whose ``forms`` list is extended in place.
        table_tag: The ``<table>`` node to read.
    """
    column_headers: list[TableHeader] = []
    row_headers: list[TableHeader] = []
    # Number of following rows still occupied by a rowspan cell, keyed by
    # the column index of that cell.
    td_rowspan: defaultdict[int, int] = defaultdict(int)

    # First pass: collect column and row headers.
    for row_index, tr_element in enumerate(table_tag.find_html("tr")):
        if not any(True for _ in tr_element.find_html("td")):
            # all column headers
            col_index = 0
            for th_element in tr_element.find_html("th"):
                header_text = ""
                # The text of the last link in the cell wins.
                for header_link in th_element.find_child(NodeKind.LINK):
                    header_text = clean_node(wxr, None, header_link)
                if header_text == "падеж":
                    continue  # ignore top left corner header
                header_span = int(th_element.attrs.get("colspan", "1"))
                column_headers.append(
                    TableHeader(header_text, col_index, header_span)
                )
                col_index += header_span
        else:  # row headers
            for node in tr_element.children:
                if isinstance(node, HTMLNode) and (
                    node.tag == "th"
                    or (node.tag == "td" and _has_header_bgcolor(node))
                ):
                    header_text = ""
                    for header_link in node.find_child(NodeKind.LINK):
                        header_text = clean_node(wxr, None, header_link)
                    header_span = int(node.attrs.get("rowspan", "1"))
                    row_headers.append(
                        TableHeader(header_text, row_index, header_span)
                    )

    # Second pass: turn data cells into forms.
    for row_index, tr_element in enumerate(table_tag.find_html("tr")):
        col_index = 0
        has_rowspan = False
        for td_element in tr_element.find_html("td"):
            rowspan = 1
            if _has_header_bgcolor(td_element):
                # this is a td tag but contains header text
                continue
            if "rowspan" in td_element.attrs:
                rowspan = int(td_element.attrs["rowspan"])
                td_rowspan[col_index] = rowspan - 1
                has_rowspan = True
            elif not has_rowspan:
                # Skip over columns that are still occupied by a rowspan
                # cell from an earlier row.
                for rowspan_index, rowspan_value in td_rowspan.items():
                    if rowspan_value > 0 and col_index == rowspan_index:
                        col_index += 1
                        td_rowspan[rowspan_index] -= 1
            td_text = clean_node(wxr, None, td_element)
            for line in td_text.splitlines():
                form = Form(form=line)
                for col_header in column_headers:
                    header_end = col_header.start_index + col_header.span
                    if col_header.start_index <= col_index < header_end:
                        form.raw_tags.append(col_header.text)
                for row_header in row_headers:
                    # The cell's row range overlaps the header's row range.
                    if (
                        row_index < row_header.start_index + row_header.span
                        and row_index + rowspan > row_header.start_index
                    ):
                        form.raw_tags.append(row_header.text)
                if form.form:
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
            col_index += 1

98 

99 

def _append_row_header(
    row_headers: list[str], table_cell: WikiNode, cell_text: str
) -> None:
    """Append the row header described by *table_cell* to *row_headers*.

    A cell whose text is "М." and which contains a link contributes the
    first element of the first argument of its first direct link child
    (nothing is appended if there is no such child); any other cell
    contributes *cell_text* unchanged.
    """
    if cell_text == "М." and table_cell.contain_node(NodeKind.LINK):
        first_link = next(iter(table_cell.find_child(NodeKind.LINK)), None)
        if first_link is not None:
            row_headers.append(first_link.largs[0][0])
    else:
        row_headers.append(cell_text)


def parse_wikitext_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Collect inflected forms from a wikitext table into *word_entry*.

    Templates this was written for:
      - https://ru.wiktionary.org/wiki/Шаблон:сущ-ru
      - Шаблон:inflection сущ ru
      - Шаблон:Гл-блок

    Header cells of rows that have no data cell become column headers.  In
    other rows, header cells and data cells with the ``#eef9ff`` background
    become row headers, and every line of the remaining data cells is handed
    to ``add_form_data``.  HTML ``<tr>`` tags found anywhere inside a table
    row are processed afterwards as additional rows.

    Args:
        wxr: Extraction context, forwarded to ``clean_node``.
        word_entry: Entry that receives the forms via ``add_form_data``.
        table_node: The wikitext table node to read.
    """
    column_headers = []
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_headers = []
        has_data_cell = table_row.contain_node(NodeKind.TABLE_CELL)
        for col_index, table_cell in enumerate(
            table_row.find_child(cell_kinds)
        ):
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                header_text = clean_node(wxr, None, table_cell)
                if has_data_cell:
                    _append_row_header(row_headers, table_cell, header_text)
                else:
                    column_headers.append(header_text)
                continue
            if table_cell.kind != NodeKind.TABLE_CELL:
                continue

            # Leave out the stray <tr> tags; they are handled further down.
            kept_children = [
                child
                for child in table_cell.children
                if not isinstance(child, HTMLNode) or child.tag != "tr"
            ]
            cell_text = clean_node(wxr, None, kept_children)
            if table_cell.attrs.get("bgcolor", "").lower() == "#eef9ff":
                # Data cell painted as a header.
                _append_row_header(row_headers, table_cell, cell_text)
                continue
            for form_text in cell_text.splitlines():
                add_form_data(
                    word_entry,
                    form_text,
                    row_headers,
                    column_headers,
                    col_index,
                )

        # Odd layout from Шаблон:Гл-блок: the <tr> tag can come after or
        # sit inside a table cell node, see Шаблон:сущ cu (-а).
        for tr_tag in table_row.find_html_recursively("tr"):
            row_headers = [
                clean_node(wxr, None, th_tag)
                for th_tag in tr_tag.find_html("th")
            ]
            has_th_tag = len(row_headers) > 0
            for td_index, td_tag in enumerate(tr_tag.find_html("td")):
                # A cell spanning several columns gets no column header.
                if "colspan" in td_tag.attrs:
                    cell_col_headers = []
                else:
                    cell_col_headers = column_headers

                if not td_tag.contain_node(NodeKind.LINK):
                    add_form_data(
                        word_entry,
                        clean_node(wxr, None, td_tag),
                        row_headers,
                        cell_col_headers,
                        td_index + 1 if has_th_tag else td_index,
                    )
                    continue

                is_header_cell = (
                    td_tag.attrs.get("bgcolor", "").lower() == "#eef9ff"
                )
                for link_node in td_tag.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if is_header_cell:
                        row_headers.append(link_text)
                    else:
                        add_form_data(
                            word_entry,
                            link_text,
                            row_headers,
                            cell_col_headers,
                            td_index,
                        )

188 

189 

def add_form_data(
    word_entry: WordEntry,
    form_text: str,
    row_headers: list[str],
    col_headers: list[str],
    col_index: int,
) -> None:
    """Append one inflected form, tagged with its headers, to *word_entry*.

    Spaces and slashes are stripped from both ends of *form_text*.  Nothing
    is added when the result is empty or is just a dash ("—" or "-").
    Otherwise the form gets all *row_headers* as raw tags, followed by the
    column header at *col_index* when that index is below
    ``len(col_headers)`` and the header is not empty, is passed through
    ``translate_raw_tags`` and is appended to ``word_entry.forms``.
    """
    form = Form(form=form_text.strip(" /"))
    if form.form in ["", "—", "-"]:
        # Placeholder cell: there is no form to record.
        return

    form.raw_tags.extend(row_headers)
    if col_index < len(col_headers):
        column_tag = col_headers[col_index]
        if column_tag != "":
            form.raw_tags.append(column_tag)
    translate_raw_tags(form)
    word_entry.forms.append(form)

204 

205 

def extract_прил_ru_comparative_forms(
    wxr: WiktextractContext, word_entry: WordEntry, expanded_node: WikiNode
) -> None:
    """Add the comparative forms found in *expanded_node* to *word_entry*.

    The direct children are scanned in order.  Once a string child whose
    cleaned text ends with "Сравнительная степень —" has been seen, every
    link that is a direct child of a later italic node yields a form tagged
    ``comparative``, unless its cleaned text is empty.
    """
    marker_seen = False
    for child in expanded_node.children:
        if isinstance(child, str):
            child_text = clean_node(wxr, None, child)
            if child_text.endswith("Сравнительная степень —"):
                marker_seen = True
            continue
        if not marker_seen:
            continue
        if not isinstance(child, WikiNode) or child.kind != NodeKind.ITALIC:
            continue
        for link_node in child.find_child(NodeKind.LINK):
            form_text = clean_node(wxr, None, link_node)
            if form_text == "":
                continue
            word_entry.forms.append(
                Form(form=form_text, tags=["comparative"])
            )