Coverage for src/wiktextract/extractor/ru/inflection.py: 92%

127 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from collections import defaultdict 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import HTMLNode, NodeKind, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

@dataclass
class TableHeader:
    """Header cell of an HTML inflection table and the cell range it labels.

    Instances are mutable on purpose: the HTML table parser decrements
    ``span`` of a row header after each data row it has been applied to.
    """

    # Header label; the parser stores the cleaned text of a link inside the
    # header cell, or "" when the cell has no link.
    text: str
    # Index of the first column the header covers.  Row headers are created
    # with 0 because only their ``span`` is used.
    start_index: int
    # Column headers: value of ``colspan``.  Row headers: value of
    # ``rowspan``, i.e. how many rows the header still applies to.
    span: int

17 

18 

# Викисловарь:Шаблоны словоизменений
# (ru.wiktionary project page; the title means "Wiktionary:Inflection templates")

20 

21 

def _html_span(cell: HTMLNode, attr_name: str) -> int:
    """Return the integer value of the ``colspan``/``rowspan`` attribute.

    A missing attribute counts as 1.  A value that ``int()`` cannot parse
    also counts as 1 instead of aborting the whole table with ValueError.
    """
    try:
        return int(cell.attrs.get(attr_name, "1"))
    except ValueError:
        return 1


def _last_link_text(wxr: WiktextractContext, cell: HTMLNode) -> str:
    """Return the cleaned text of the last direct link child of ``cell``.

    Returns "" when ``cell`` has no direct link child.
    """
    text = ""
    for link_node in cell.find_child(NodeKind.LINK):
        text = clean_node(wxr, None, link_node)
    return text


def parse_html_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_tag: HTMLNode
) -> None:
    """Extract forms from an HTML ``<table>`` into ``word_entry.forms``.

    Example template: https://ru.wiktionary.org/wiki/Шаблон:прил

    Rows without any ``<td>`` define column headers.  In the other rows,
    ``<th>`` cells and ``<td>`` cells with the header background colour are
    row headers, and every whitespace-separated token of the remaining
    ``<td>`` cells becomes one ``Form``.  Each form gets as raw tags the
    text of every column header whose column range contains the cell's
    column index, followed by the text of all active row headers, and is
    then passed through ``translate_raw_tags``.

    Args:
        wxr: Context passed on to ``clean_node``.
        word_entry: Entry whose ``forms`` list receives the new forms.
        table_tag: The ``<table>`` node to read.
    """
    column_headers: list[TableHeader] = []
    row_headers: list[TableHeader] = []
    # column index -> number of following rows in which that column is
    # still occupied by a cell that declared a rowspan
    td_rowspan: defaultdict[int, int] = defaultdict(int)
    for tr_element in table_tag.find_html("tr"):
        td_elements = list(tr_element.find_html("td"))
        if not td_elements:
            # all header
            current_index = 0
            for th_element in tr_element.find_html("th"):
                header_text = _last_link_text(wxr, th_element)
                if header_text == "падеж":
                    continue  # ignore top left corner header
                header_span = _html_span(th_element, "colspan")
                column_headers.append(
                    TableHeader(header_text, current_index, header_span)
                )
                current_index += header_span
            continue

        col_index = 0
        has_rowspan = False
        for th_element in tr_element.find_html("th"):
            row_headers.append(
                TableHeader(
                    _last_link_text(wxr, th_element),
                    0,
                    _html_span(th_element, "rowspan"),
                )
            )

        for td_element in td_elements:
            # Compared case-insensitively, like the wikitext table parser.
            if td_element.attrs.get("bgcolor", "").lower() == "#eef9ff":
                # this is a td tag but contains header text
                row_headers.append(
                    TableHeader(
                        _last_link_text(wxr, td_element),
                        0,
                        _html_span(td_element, "rowspan"),
                    )
                )
                continue
            if "rowspan" in td_element.attrs:
                # Remember how many later rows this column stays occupied.
                td_rowspan[col_index] = _html_span(td_element, "rowspan") - 1
                has_rowspan = True
            elif not has_rowspan:
                # Skip columns still covered by a rowspan cell from an
                # earlier row.  Only existing keys are updated, so the dict
                # does not change size while it is being iterated.
                for rowspan_index, rowspan_value in td_rowspan.items():
                    if rowspan_value > 0 and col_index == rowspan_index:
                        col_index += 1
                        td_rowspan[rowspan_index] -= 1
            td_text = clean_node(wxr, None, td_element)
            for token in td_text.split():
                form = Form(form=token)
                for col_header in column_headers:
                    header_end = col_header.start_index + col_header.span
                    if col_header.start_index <= col_index < header_end:
                        form.raw_tags.append(col_header.text)
                form.raw_tags.extend(h.text for h in row_headers)
                if form.form:
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
            col_index += 1

        # Row headers with a remaining rowspan carry over to the next row;
        # the others are dropped.
        # NOTE(review): nesting reconstructed from a flattened listing -
        # confirm this bookkeeping runs for data rows only.
        still_active = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                still_active.append(row_header)
        row_headers = still_active

94 

95 

def _append_row_header(
    row_headers: list[str], cell: WikiNode, cell_text: str
) -> None:
    """Append the row-header label of ``cell`` to ``row_headers``.

    The label is normally ``cell_text``.  When the text is the abbreviation
    "М." and the cell contains a link, the first argument of the cell's
    first direct link child is used instead (presumably the unabbreviated
    link target - verify against the template output).  If the link is not
    a direct child, nothing is appended.
    """
    if cell_text == "М." and cell.contain_node(NodeKind.LINK):
        for link_node in cell.find_child(NodeKind.LINK):
            row_headers.append(link_node.largs[0][0])
            break
    else:
        row_headers.append(cell_text)


def _parse_embedded_tr_tags(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table_row: WikiNode,
    column_headers: list[str],
) -> None:
    """Extract forms from HTML ``<tr>`` tags found anywhere under a row.

    Each ``<tr>`` starts with its own empty row-header list.  ``<th>`` texts
    and links inside ``<td>`` cells with the header background colour are
    row headers; everything else is form data.  A ``<td>`` carrying a
    ``colspan`` attribute gets no column header.
    """
    for tr_tag in table_row.find_html_recursively("tr"):
        row_headers = []
        has_th_tag = False
        for th_tag in tr_tag.find_html("th"):
            row_headers.append(clean_node(wxr, None, th_tag))
            has_th_tag = True
        for td_index, td_tag in enumerate(tr_tag.find_html("td")):
            is_header_td = (
                td_tag.attrs.get("bgcolor", "").lower() == "#eef9ff"
            )
            td_col_headers = (
                [] if "colspan" in td_tag.attrs else column_headers
            )
            if td_tag.contain_node(NodeKind.LINK):
                for link_node in td_tag.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if is_header_td:
                        row_headers.append(link_text)
                    else:
                        add_form_data(
                            word_entry,
                            link_text,
                            row_headers,
                            td_col_headers,
                            td_index,
                        )
            else:
                # Without links the column index is shifted by one when the
                # row has a <th>; the link branch above uses the plain
                # index (kept exactly as in the original code).
                add_form_data(
                    word_entry,
                    clean_node(wxr, None, td_tag),
                    row_headers,
                    td_col_headers,
                    td_index + 1 if has_th_tag else td_index,
                )


def parse_wikitext_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract forms from a wikitext table into ``word_entry.forms``.

    Example templates:
    https://ru.wiktionary.org/wiki/Шаблон:сущ-ru
    Шаблон:inflection сущ ru
    Шаблон:Гл-блок

    Header cells of rows without any data cell are collected as column
    headers.  In rows with data cells, header cells and data cells with the
    header background colour are row headers, and each line of the other
    data cells is handed to ``add_form_data``.  HTML ``<tr>`` tags nested
    under a row are parsed in a second pass.

    Args:
        wxr: Context passed on to ``clean_node``.
        word_entry: Entry whose ``forms`` list receives the new forms.
        table_node: The table node to read.
    """
    column_headers: list[str] = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_headers: list[str] = []
        has_data_cell = table_row.contain_node(NodeKind.TABLE_CELL)
        cells = table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        )
        for col_index, table_cell in enumerate(cells):
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                cell_text = clean_node(wxr, None, table_cell)
                if has_data_cell:
                    _append_row_header(row_headers, table_cell, cell_text)
                else:
                    column_headers.append(cell_text)
            elif table_cell.kind == NodeKind.TABLE_CELL:
                # remove cursed <tr> tag: it is handled by the second pass
                cell_text = clean_node(
                    wxr,
                    None,
                    [
                        child
                        for child in table_cell.children
                        if not (
                            isinstance(child, HTMLNode)
                            and child.tag == "tr"
                        )
                    ],
                )
                if table_cell.attrs.get("bgcolor", "").lower() == "#eef9ff":
                    _append_row_header(row_headers, table_cell, cell_text)
                else:
                    for form_text in cell_text.splitlines():
                        add_form_data(
                            word_entry,
                            form_text,
                            row_headers,
                            column_headers,
                            col_index,
                        )

        # cursed layout from Шаблон:Гл-блок
        # tr tag could be after or inside table cell node: Шаблон:сущ cu (-а)
        _parse_embedded_tr_tags(wxr, word_entry, table_row, column_headers)

184 

185 

def add_form_data(
    word_entry: WordEntry,
    form_text: str,
    row_headers: list[str],
    col_headers: list[str],
    col_index: int,
) -> None:
    """Append one table cell value to ``word_entry.forms``.

    Spaces and slashes are stripped from both ends of ``form_text``.  Empty
    values and the dash placeholders "—" and "-" are ignored.  Otherwise
    the form gets all ``row_headers`` as raw tags, plus the column header at
    ``col_index`` when that index exists and the header is not empty, and
    is passed through ``translate_raw_tags`` before being appended.

    Args:
        word_entry: Entry whose ``forms`` list receives the form.
        form_text: Raw cell text.
        row_headers: Labels of the row headers that apply to the cell.
        col_headers: Column header labels, indexed by column.
        col_index: Column index of the cell within ``col_headers``.
    """
    form = Form(form=form_text.strip(" /"))
    if form.form in ("", "—", "-"):
        return  # empty cell or dash placeholder: nothing to record

    form.raw_tags.extend(row_headers)
    # The lower bound stops a negative index from silently picking a header
    # from the end of the list.
    if 0 <= col_index < len(col_headers) and col_headers[col_index] != "":
        form.raw_tags.append(col_headers[col_index])
    translate_raw_tags(form)
    word_entry.forms.append(form)

200 

201 

def extract_прил_ru_comparative_forms(
    wxr: WiktextractContext, word_entry: WordEntry, expanded_node: WikiNode
) -> None:
    """Collect comparative forms that follow the comparative-degree marker.

    Walks the direct children of ``expanded_node``.  Once a string child
    whose cleaned text ends with "Сравнительная степень —" has been seen,
    every link that is a direct child of a later italic node is added to
    ``word_entry.forms`` as a form tagged "comparative".  The marker is
    never reset, so all italic nodes after it are considered.

    Args:
        wxr: Context passed on to ``clean_node``.
        word_entry: Entry whose ``forms`` list receives the forms.
        expanded_node: Node whose children are scanned.
    """
    marker_seen = False
    for child in expanded_node.children:
        if isinstance(child, str):
            if clean_node(wxr, None, child).endswith(
                "Сравнительная степень —"
            ):
                marker_seen = True
            continue
        if not marker_seen:
            continue
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC):
            continue
        for link_node in child.find_child(NodeKind.LINK):
            form_text = clean_node(wxr, None, link_node)
            if form_text != "":
                word_entry.forms.append(
                    Form(form=form_text, tags=["comparative"])
                )

221 )