Coverage for src/wiktextract/extractor/it/inflection.py: 96%

134 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1from dataclasses import dataclass 

2from itertools import chain 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_tabs_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Collect gender/number forms from a ``Tabs`` template.

    Positional arguments 1 to 4 are read in order and tagged as masculine
    singular, masculine plural, feminine singular and feminine plural.
    A value that cleans to the empty string or to the current page title
    is skipped; every other value is appended to ``word_entry.forms``.
    """
    # https://it.wiktionary.org/wiki/Template:Tabs
    tags_by_position = (
        ["masculine", "singular"],
        ["masculine", "plural"],
        ["feminine", "singular"],
        ["feminine", "plural"],
    )
    for position, form_tags in enumerate(tags_by_position, start=1):
        raw_value = node.template_parameters.get(position, "")
        text = clean_node(wxr, None, raw_value)
        if text in ("", wxr.wtp.title):
            continue
        word_entry.forms.append(Form(form=text, tags=form_tags))

29 

30 

def extract_it_decl_agg_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract declension forms from the tables of an expanded template.

    The template node is turned back into wikitext and re-parsed with
    ``expand_all=True``. In every resulting table:

    * a header cell carrying a ``colspan`` attribute sets a tag that is
      kept for the rest of the table;
    * a header cell that is the only header cell of its row sets the tag
      of that row;
    * any other header cell with non-empty text is recorded as a column
      tag;
    * a data cell whose text is neither empty nor the page title becomes
      a ``Form`` carrying whichever of those tags apply.
    """
    # https://it.wiktionary.org/wiki/Template:It-decl-agg4
    # https://it.wiktionary.org/wiki/Template:It-decl-agg2
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    for table in root.find_child(NodeKind.TABLE):
        table_tag = ""
        column_tags = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_tag = ""
            column = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if cell.attrs.get("colspan", "") != "":
                        table_tag = clean_node(wxr, None, cell)
                        continue
                    header_count = sum(
                        1 for _ in row.find_child(NodeKind.TABLE_HEADER_CELL)
                    )
                    if header_count == 1:
                        row_tag = clean_node(wxr, None, cell)
                        continue
                    header_text = clean_node(wxr, None, cell)
                    if header_text != "":
                        column_tags.append(header_text)
                elif cell.kind == NodeKind.TABLE_CELL:
                    text = clean_node(wxr, None, cell)
                    if text not in ("", wxr.wtp.title):
                        form = Form(form=text)
                        form.raw_tags.extend(
                            tag for tag in (table_tag, row_tag) if tag != ""
                        )
                        if column < len(column_tags):
                            form.raw_tags.append(column_tags[column])
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    # every data cell occupies one column, kept or skipped
                    column += 1

82 

83 

def extract_appendix_conjugation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract conjugation forms from a separate conjugation page.

    The page body is fetched with ``get_page_body(page_title, 100)``; the
    literal 100 is presumably the namespace id of the appendix pages
    (TODO confirm). Nothing happens when no body is returned. Otherwise
    each direct child template whose lower-cased name ends with "-conj"
    is handed to ``extract_conj_template``.
    """
    # https://it.wiktionary.org/wiki/Appendice:Coniugazioni
    body = wxr.wtp.get_page_body(page_title, 100)
    if body is not None:
        for template in wxr.wtp.parse(body).find_child(NodeKind.TEMPLATE):
            if not template.template_name.lower().endswith("-conj"):
                continue
            extract_conj_template(wxr, word_entry, template, page_title)

95 

96 

@dataclass
class TableHeader:
    """Text and grid placement of one header cell of a conjugation table."""

    # cleaned text of the header cell
    text: str
    # column counter value at which the header cell was encountered
    col_index: int
    # value of the cell's "colspan" attribute (1 when absent)
    colspan: int
    # index of the table row holding the header cell
    row_index: int
    # value of the cell's "rowspan" attribute (1 when absent)
    rowspan: int

104 

105 

# Extract conjugation forms from every table of the expanded template.
# The template node is converted back to wikitext and re-parsed with
# expand_all=True. Each table is then walked twice: the first walk records
# header cells as TableHeader entries, the second walk turns the text of the
# remaining cells into forms through add_conj_form, or through
# extract_conj_cell_table when a cell contains a nested table.
# NOTE(review): this listing carries no indentation, so the nesting level of
# individual statements (notably the trailing "col_index += colspan" lines)
# must be confirmed against the real source file.
106def extract_conj_template( 

107 wxr: WiktextractContext, 

108 word_entry: WordEntry, 

109 t_node: TemplateNode, 

110 page_title: str, 

111) -> None: 

112 # https://it.wiktionary.org/wiki/Template:It-conj 

113 expanded_node = wxr.wtp.parse( 

114 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

115 ) 

116 for table in expanded_node.find_child(NodeKind.TABLE): 

# header lists are rebuilt for every table
117 col_headers = [] 

118 row_headers = [] 

# first walk: collect header cells together with their grid position
119 for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)): 

# decides below whether a header cell of this row is stored as a column
# header (row without data cells) or as a row header (row with data cells)
120 row_has_data = row.contain_node(NodeKind.TABLE_CELL) 

121 col_index = 0 

# shift the starting column past headers from earlier rows whose rowspan
# still reaches this row and that start at or before the current column
122 for header in chain(col_headers, row_headers): 

123 if ( 

124 row_index > header.row_index 

125 and row_index < header.row_index + header.rowspan 

126 and header.col_index <= col_index 

127 ): 

128 col_index += header.colspan 

129 for cell_node in row.find_child( 

130 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL 

131 ): 

132 cell_text = clean_node(wxr, None, cell_node) 

# a missing span attribute counts as 1; int() raises ValueError on a
# non-numeric attribute value
133 colspan = int(cell_node.attrs.get("colspan", "1")) 

134 rowspan = int(cell_node.attrs.get("rowspan", "1")) 

# data cells are not recorded in this walk
135 if cell_node.kind == NodeKind.TABLE_CELL: 

136 pass 

137 elif not row_has_data: 

138 col_headers.append( 

139 TableHeader( 

140 cell_text, col_index, colspan, row_index, rowspan 

141 ) 

142 ) 

143 else: 

144 row_headers.append( 

145 TableHeader( 

146 cell_text, col_index, colspan, row_index, rowspan 

147 ) 

148 ) 

149 col_index += colspan 

150 

# second walk: produce forms, using the complete header lists
151 for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)): 

152 col_index = 0 

# texts of the headers already counted into col_index for this row
153 added_headers = set() 

# unlike the first walk this test uses ">=", so headers that start in this
# very row are counted as well
154 for header in chain(col_headers, row_headers): 

155 if ( 

156 row_index >= header.row_index 

157 and row_index < header.row_index + header.rowspan 

158 and header.col_index <= col_index 

159 ): 

160 col_index += header.colspan 

161 added_headers.add(header.text) 

162 for cell_node in row.find_child( 

163 NodeKind.TABLE_CELL | NodeKind.TABLE_HEADER_CELL 

164 ): 

# every table found anywhere below the cell is handled by
# extract_conj_cell_table, starting at the current column
165 cell_has_table = False 

166 for cell_table in cell_node.find_child_recursively( 

167 NodeKind.TABLE 

168 ): 

169 extract_conj_cell_table( 

170 wxr, 

171 word_entry, 

172 cell_table, 

173 row_headers, 

174 col_headers, 

175 page_title, 

176 col_index, 

177 row_index, 

178 ) 

179 cell_has_table = True 

180 if not cell_has_table: 

181 colspan = int(cell_node.attrs.get("colspan", "1")) 

182 rowspan = int(cell_node.attrs.get("rowspan", "1")) 

183 cell_text = clean_node(wxr, None, cell_node) 

# header cells yield no form; the column counter moves only when the header
# text was not already counted in added_headers above
184 if cell_node.kind == NodeKind.TABLE_HEADER_CELL: 

185 if cell_text not in added_headers: 

186 col_index += colspan 

187 continue 

# one cell may hold several forms, split first on line breaks and then on
# commas; empty pieces, "—" and the page title itself are dropped
188 for line in cell_text.splitlines(): 

189 for form_str in line.split(","): 

190 form_str = form_str.strip() 

191 if form_str not in ["", "—", wxr.wtp.title]: 

192 add_conj_form( 

193 word_entry, 

194 form_str, 

195 page_title, 

196 colspan, 

197 rowspan, 

198 col_index, 

199 col_headers, 

200 row_index, 

201 row_headers, 

202 ) 

203 col_index += colspan 

204 

205 

def extract_conj_cell_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table_node: WikiNode,
    row_headers: list[TableHeader],
    col_headers: list[TableHeader],
    page_title: str,
    start_col_index: int,
    row_index: int,
):
    """Add the forms found in a table nested inside a conjugation cell.

    Only data cells of the nested table are read. The position of a cell
    within its nested row is added to ``start_col_index`` to obtain the
    column passed to ``add_conj_form``; ``row_index`` is passed through
    unchanged. Each line of the cleaned cell text is one form, except
    empty lines, "—" and the page title.
    """
    for nested_row in table_node.find_child(NodeKind.TABLE_ROW):
        data_cells = nested_row.find_child(NodeKind.TABLE_CELL)
        for offset, data_cell in enumerate(data_cells):
            span_cols = int(data_cell.attrs.get("colspan", "1"))
            span_rows = int(data_cell.attrs.get("rowspan", "1"))
            cell_text = clean_node(wxr, None, data_cell)
            for piece in cell_text.splitlines():
                if piece in ("", "—", wxr.wtp.title):
                    continue
                add_conj_form(
                    word_entry,
                    piece,
                    page_title,
                    span_cols,
                    span_rows,
                    start_col_index + offset,
                    col_headers,
                    row_index,
                    row_headers,
                )

233 

234 

def add_conj_form(
    word_entry: WordEntry,
    form_str: str,
    page_title: str,
    colspan: int,
    rowspan: int,
    col_index: int,
    col_headers: list[TableHeader],
    row_index: int,
    row_headers: list[TableHeader],
):
    """Create one conjugation form and tag it with its table headers.

    A ``Form`` is built from ``form_str`` with ``page_title`` as source.
    Column headers and then row headers are scanned from last recorded to
    first; the texts of the matching ones are appended to
    ``form.raw_tags`` in their recorded order. The form is then passed to
    ``translate_raw_tags`` and appended to ``word_entry.forms``.
    """
    form = Form(form=form_str, source=page_title)

    # --- column headers ---
    picked_cols: list[str] = []
    anchor_row = -1  # row of the most recently accepted column header
    for header in reversed(col_headers):
        header_end_row = header.row_index + header.rowspan
        overlaps_columns = (
            header.col_index < col_index + colspan
            and col_index < header.col_index + header.colspan
        )
        is_new = (
            header.text not in form.raw_tags
            and header.text not in picked_cols
        )
        if anchor_row == -1:
            # nothing accepted yet: the header must end at or above the cell
            is_adjacent = header_end_row <= row_index
        else:
            # otherwise it must end at, or one row past, the accepted one
            is_adjacent = header_end_row in (anchor_row, anchor_row + 1)
        # the last "imperativo" column header in Template:It-conj
        spans_cell_rows_from_first_column = (
            header.col_index == 0
            and header.row_index < row_index + rowspan
            and header_end_row > row_index
        )
        if (
            overlaps_columns and is_new and is_adjacent
        ) or spans_cell_rows_from_first_column:
            picked_cols.append(header.text)
            anchor_row = header.row_index
    form.raw_tags.extend(reversed(picked_cols))

    # --- row headers ---
    picked_rows: list[str] = []
    anchor_col = -1  # column of the most recently accepted row header
    for header in reversed(row_headers):
        header_end_col = header.col_index + header.colspan
        overlaps_rows = (
            header.row_index < row_index + rowspan
            and row_index < header.row_index + header.rowspan
        )
        is_new = (
            header.text not in form.raw_tags
            and header.text not in picked_rows
        )
        if anchor_col == -1:
            # nothing accepted yet: the header must end at or left of the cell
            is_adjacent = header_end_col <= col_index
        else:
            is_adjacent = (
                header_end_col in (anchor_col, anchor_col + 1)
                or header.col_index == anchor_col
            )
        if overlaps_rows and is_new and is_adjacent:
            picked_rows.append(header.text)
            anchor_col = header.col_index
    form.raw_tags.extend(reversed(picked_rows))

    translate_raw_tags(form)
    word_entry.forms.append(form)