Coverage for src/wiktextract/extractor/ru/inflection.py: 98%

103 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from collections import defaultdict 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import HTMLNode, NodeKind, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

@dataclass
class TableHeader:
    """Header cell of an inflection table together with the range it covers.

    Instances are intentionally mutable: ``parse_adj_forms_table`` decrements
    ``span`` of row headers after each processed table row.
    """

    # Cleaned text of the header cell.
    text: str
    # First column covered by the header (row headers are created with 0).
    start_index: int
    # Number of columns (``colspan``) or rows (``rowspan``) still covered.
    span: int

17 

18 

def parse_adj_forms_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    expanded_template: WikiNode,
) -> None:
    """Extract adjective forms from the HTML tables of an expanded template.

    Handles the HTML table layout of
    https://ru.wiktionary.org/wiki/Шаблон:прил

    For every ``<table>`` found in ``expanded_template``:

    * a ``<tr>`` without any ``<td>`` is a header row; each ``<th>`` becomes a
      column header covering ``colspan`` columns (default 1),
    * a ``<td>`` whose ``bgcolor`` is ``#EEF9FF`` (compared
      case-insensitively) is a row header that stays in effect for
      ``rowspan`` rows (default 1),
    * every other ``<td>`` is a data cell; each whitespace-separated token of
      its cleaned text is appended to ``word_entry.forms`` as a ``Form``
      tagged with the matching column headers followed by the active row
      headers, after ``translate_raw_tags`` has been applied.

    Args:
        wxr: Context object forwarded to ``clean_node``.
        word_entry: Entry whose ``forms`` list is extended in place.
        expanded_template: Node that is searched for ``<table>`` elements.
    """
    for table_element in expanded_template.find_html("table"):
        column_headers: list[TableHeader] = []
        row_headers: list[TableHeader] = []
        # column index -> number of following rows in which that column is
        # still occupied by a data cell that declared ``rowspan``
        td_rowspan: defaultdict[int, int] = defaultdict(int)
        for tr_element in table_element.find_html("tr"):
            td_elements = list(tr_element.find_html("td"))
            if not td_elements:
                # all header
                current_index = 0
                for th_element in tr_element.find_html("th"):
                    header_text = ""
                    # when the cell holds several links the last one wins
                    for header_link in th_element.find_child(NodeKind.LINK):
                        header_text = clean_node(wxr, None, header_link)
                    if header_text == "падеж":
                        continue  # ignore top left corner header
                    header_span = int(th_element.attrs.get("colspan", "1"))
                    column_headers.append(
                        TableHeader(header_text, current_index, header_span)
                    )
                    current_index += header_span
            else:
                col_index = 0
                has_rowspan = False
                for td_element in td_elements:
                    bgcolor = td_element.attrs.get("bgcolor", "")
                    if bgcolor.lower() == "#eef9ff":
                        # this is a td tag but contains header text
                        header_text = ""
                        for header_link in td_element.find_child(
                            NodeKind.LINK
                        ):
                            header_text = clean_node(wxr, None, header_link)
                        header_span = int(
                            td_element.attrs.get("rowspan", "1")
                        )
                        row_headers.append(
                            TableHeader(header_text, 0, header_span)
                        )
                        continue
                    if "rowspan" in td_element.attrs:
                        # remember for how many later rows this column is
                        # taken by the current cell
                        td_rowspan[col_index] = (
                            int(td_element.attrs["rowspan"]) - 1
                        )
                        has_rowspan = True
                    elif not has_rowspan:
                        # step over columns still occupied by a rowspan cell
                        # of an earlier row; only values of existing keys
                        # are changed, so iterating the dict stays valid
                        for rowspan_index, rowspan_value in td_rowspan.items():
                            if (
                                rowspan_value > 0
                                and col_index == rowspan_index
                            ):
                                col_index += 1
                                td_rowspan[rowspan_index] -= 1
                    td_text = clean_node(wxr, None, td_element)
                    # str.split() never yields empty tokens, so every token
                    # is a usable form
                    for line in td_text.split():
                        form = Form(form=line)
                        for col_header in column_headers:
                            header_end = (
                                col_header.start_index + col_header.span
                            )
                            if col_header.start_index <= col_index < header_end:
                                form.raw_tags.append(col_header.text)
                        form.raw_tags.extend(h.text for h in row_headers)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    col_index += 1

            # one row consumed: shorten the remaining span of every row
            # header and drop those that no longer cover the next row
            updated_row_headers = []
            for row_header in row_headers:
                if row_header.span > 1:
                    row_header.span -= 1
                    updated_row_headers.append(row_header)
            row_headers = updated_row_headers

91 

92 

def parse_wikitext_forms_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    expanded_template: WikiNode,
) -> None:
    """Extract forms from the first wikitext table of an expanded template.

    Used for tables produced by templates such as
    https://ru.wiktionary.org/wiki/Шаблон:сущ-ru
    Шаблон:inflection сущ ru
    Шаблон:Гл-блок

    Header cells are collected as column headers. Table cells whose
    ``bgcolor`` is ``#eef9ff`` (case-insensitive) are row headers of the
    current row. Every line of any other cell is passed to
    ``add_form_data``. HTML ``<tr>`` tags found anywhere below a table row
    are processed as additional rows with their own row headers.

    Returns without doing anything when ``expanded_template`` has no direct
    table child; otherwise ``clean_node`` is finally called with
    ``word_entry`` to add category links.

    Args:
        wxr: Context object forwarded to ``clean_node``.
        word_entry: Entry whose ``forms`` list is extended in place.
        expanded_template: Node whose first ``NodeKind.TABLE`` child is read.
    """
    # only the first table is of interest
    table_node = next(
        iter(expanded_template.find_child(NodeKind.TABLE)), None
    )
    if table_node is None:
        return

    column_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_headers = []
        for col_index, table_cell in enumerate(
            table_row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
        ):
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                column_headers.append(clean_node(wxr, None, table_cell))
            elif table_cell.kind == NodeKind.TABLE_CELL:
                cell_text = clean_node(  # remove cursed <tr> tag
                    wxr,
                    None,
                    [
                        n
                        for n in table_cell.children
                        if not (isinstance(n, HTMLNode) and n.tag == "tr")
                    ],
                )
                if table_cell.attrs.get("bgcolor", "").lower() == "#eef9ff":
                    if cell_text == "М." and table_cell.contain_node(
                        NodeKind.LINK
                    ):
                        # use the first argument of the first direct link
                        # child instead of the displayed text "М."
                        for link_node in table_cell.find_child(NodeKind.LINK):
                            row_headers.append(link_node.largs[0][0])
                            break
                    else:
                        row_headers.append(cell_text)
                else:
                    for form_text in cell_text.splitlines():
                        add_form_data(
                            word_entry,
                            form_text,
                            row_headers,
                            column_headers,
                            col_index,
                        )

        # cursed layout from Шаблон:Гл-блок
        # tr tag could be after or inside table cell node: Шаблон:сущ cu (-а)
        for tr_tag in table_row.find_html_recursively("tr"):
            row_headers = []
            for td_index, td_tag in enumerate(tr_tag.find_html("td")):
                # cells with a colspan attribute get no column header
                cell_col_headers = (
                    [] if "colspan" in td_tag.attrs else column_headers
                )
                if td_tag.contain_node(NodeKind.LINK):
                    is_header = (
                        td_tag.attrs.get("bgcolor", "").lower() == "#eef9ff"
                    )
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        link_text = clean_node(wxr, None, link_node)
                        if is_header:
                            row_headers.append(link_text)
                        else:
                            add_form_data(
                                word_entry,
                                link_text,
                                row_headers,
                                cell_col_headers,
                                td_index,
                            )
                else:
                    add_form_data(
                        word_entry,
                        clean_node(wxr, None, td_tag),
                        row_headers,
                        cell_col_headers,
                        td_index,
                    )

    clean_node(wxr, word_entry, expanded_template)  # add category links

173 

174 

def add_form_data(
    word_entry: WordEntry,
    form_text: str,
    row_headers: list[str],
    col_headers: list[str],
    col_index: int,
) -> None:
    """Append one tagged form to ``word_entry.forms``.

    ``form_text`` is stripped of surrounding spaces and slashes. Nothing is
    added when the result is empty or is the placeholder "—".

    Args:
        word_entry: Entry whose ``forms`` list is extended in place.
        form_text: Raw text of the form.
        row_headers: Raw tags added to the form first.
        col_headers: Column headers indexed by column position.
        col_index: Column of the form; the header at this index is added as
            a raw tag when ``col_headers`` is long enough.
    """
    text = form_text.strip(" /")
    if text in ("", "—"):
        return  # empty cell or "no such form" placeholder

    form = Form(form=text)
    form.raw_tags.extend(row_headers)
    if col_index < len(col_headers):
        form.raw_tags.append(col_headers[col_index])
    translate_raw_tags(form)
    word_entry.forms.append(form)