Coverage for src/wiktextract/extractor/it/inflection.py: 91%

117 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_tabs_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Add the gender/number forms listed in a ``Tabs`` template.

    Positional arguments 1 to 4 are read in order and tagged as masculine
    singular, masculine plural, feminine singular and feminine plural.
    An argument whose cleaned text is empty or equal to the current page
    title is skipped; every other one is appended to ``word_entry.forms``.

    Template page: https://it.wiktionary.org/wiki/Template:Tabs
    """
    tags_by_position = [
        ["masculine", "singular"],
        ["masculine", "plural"],
        ["feminine", "singular"],
        ["feminine", "plural"],
    ]
    # Template arguments are 1-based, hence ``start=1``.
    for position, form_tags in enumerate(tags_by_position, start=1):
        raw_value = node.template_parameters.get(position, "")
        text = clean_node(wxr, None, raw_value)
        if text == "" or text == wxr.wtp.title:
            continue
        word_entry.forms.append(Form(form=text, tags=form_tags))

29 

30 

def extract_it_decl_agg_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Add the forms found in an expanded ``It-decl-agg`` declension table.

    The template is expanded and re-parsed, then every table is walked row
    by row. Header cells supply raw tags:

    * a header cell carrying a ``colspan`` attribute sets the table-wide tag;
    * a header cell that is the only header cell of its row sets the row tag;
    * any other non-empty header cell is recorded as a column tag.

    Each data cell whose cleaned text is neither empty nor the page title
    becomes a ``Form`` with the applicable raw tags, which are then passed
    through ``translate_raw_tags``.

    Template pages:
    https://it.wiktionary.org/wiki/Template:It-decl-agg4
    https://it.wiktionary.org/wiki/Template:It-decl-agg2
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    for table in root.find_child(NodeKind.TABLE):
        table_tag = ""
        column_tags = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_tag = ""
            column = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if cell.attrs.get("colspan", "") != "":
                        table_tag = clean_node(wxr, None, cell)
                    elif (
                        sum(
                            1
                            for _ in row.find_child(
                                NodeKind.TABLE_HEADER_CELL
                            )
                        )
                        == 1
                    ):
                        # The row's single header cell labels the whole row.
                        row_tag = clean_node(wxr, None, cell)
                    else:
                        header_text = clean_node(wxr, None, cell)
                        if header_text != "":
                            column_tags.append(header_text)
                elif cell.kind == NodeKind.TABLE_CELL:
                    word = clean_node(wxr, None, cell)
                    if word != "" and word != wxr.wtp.title:
                        form = Form(form=word)
                        form.raw_tags.extend(
                            tag for tag in (table_tag, row_tag) if tag != ""
                        )
                        if column < len(column_tags):
                            form.raw_tags.append(column_tags[column])
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    # Skipped cells still occupy a column.
                    column += 1

82 

83 

def extract_appendix_conjugation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract conjugation forms from a separate conjugation page.

    Loads the body of ``page_title``, parses it, and hands every top-level
    ``it-conj`` template (name compared case-insensitively) to
    ``extract_it_conj_template``. Does nothing when the page body is
    unavailable.

    See https://it.wiktionary.org/wiki/Appendice:Coniugazioni
    """
    # NOTE(review): 100 is presumably the namespace id of "Appendice" --
    # confirm against the wiki's namespace configuration.
    body = wxr.wtp.get_page_body(page_title, 100)
    if body is None:
        return
    for template in wxr.wtp.parse(body).find_child(NodeKind.TEMPLATE):
        if template.template_name.lower() != "it-conj":
            continue
        extract_it_conj_template(wxr, word_entry, template, page_title)

95 

96 

@dataclass
class TableHeader:
    """A table header cell's text together with its position in the grid."""

    # Cleaned header text; used as a raw tag for the forms it covers.
    text: str
    # Index of the first column the header covers.
    col_index: int
    # Number of columns the header covers, starting at ``col_index``.
    colspan: int
    # Row position and height; the code in this module always passes 0.
    row_index: int
    rowspan: int

104 

105 

def extract_it_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """Add the verb forms found in an expanded ``It-conj`` template.

    The template is expanded and re-parsed, then each table is walked row by
    row while tracking the current row header and the list of column
    headers:

    * header cells reading "persona" or "indicativo" are ignored;
    * "condizionale" and "congiuntivo" reset the column headers;
    * "imperativo" resets the column headers and becomes the row header;
    * any other header cell becomes the row header when its row also holds
      data cells, otherwise it is stored as a column header with its
      ``colspan`` (1 when the attribute is not a plain number).

    A data cell containing nested tables is delegated to
    ``extract_it_conj_cell_table``; otherwise each line of its cleaned text
    is added through ``add_it_conj_form`` unless it is empty, equals the
    page being processed, or starts with "verbo di ".

    Template page: https://it.wiktionary.org/wiki/Template:It-conj
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    for table in root.find_child(NodeKind.TABLE):
        col_headers = []
        row_header = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_str = clean_node(wxr, None, cell)
                    if header_str in ("persona", "indicativo"):
                        continue
                    if header_str in ("condizionale", "congiuntivo"):
                        col_headers.clear()
                        continue
                    if header_str == "imperativo":
                        col_headers.clear()
                        row_header = "imperativo"
                        continue
                    if row.contain_node(NodeKind.TABLE_CELL):
                        # A header sharing its row with data cells labels
                        # that row.
                        row_header = header_str
                        continue
                    colspan_str = cell.attrs.get("colspan", "1")
                    if re.fullmatch(r"\d+", colspan_str):
                        colspan = int(colspan_str)
                    else:
                        colspan = 1
                    col_headers.append(
                        TableHeader(header_str, col_index, colspan, 0, 0)
                    )
                    col_index += colspan
                elif cell.kind == NodeKind.TABLE_CELL:
                    found_nested_table = False
                    for nested_table in cell.find_child_recursively(
                        NodeKind.TABLE
                    ):
                        found_nested_table = True
                        extract_it_conj_cell_table(
                            wxr,
                            word_entry,
                            nested_table,
                            row_header,
                            col_headers,
                            page_title,
                        )
                    if not found_nested_table:
                        cell_text = clean_node(wxr, None, cell)
                        for line in cell_text.splitlines():
                            candidate = line.strip(", ")
                            if candidate.startswith("verbo di "):
                                continue  # first row
                            if candidate == "" or candidate == wxr.wtp.title:
                                continue
                            add_it_conj_form(
                                word_entry,
                                candidate,
                                page_title,
                                row_header,
                                col_index,
                                col_headers,
                            )
                    col_index += 1

181 

182 

def extract_it_conj_cell_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table_node: WikiNode,
    row_header: str,
    col_headers: list[TableHeader],
    page_title: str,
) -> None:
    """Add the forms held in a table nested inside a conjugation cell.

    Every data cell of every row is cleaned and split into lines. Each line
    that is neither empty nor the current page title is added through
    ``add_it_conj_form``, using the cell's position among the row's data
    cells as its column index.
    """
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        data_cells = row.find_child(NodeKind.TABLE_CELL)
        for position, cell in enumerate(data_cells):
            for line in clean_node(wxr, None, cell).splitlines():
                if line == "" or line == wxr.wtp.title:
                    continue
                add_it_conj_form(
                    word_entry,
                    line,
                    page_title,
                    row_header,
                    position,
                    col_headers,
                )

203 

204 

def add_it_conj_form(
    word_entry: WordEntry,
    form_str: str,
    page_title: str,
    row_header: str,
    col_index: int,
    col_headers: list[TableHeader],
) -> None:
    """Append one conjugated form to ``word_entry.forms``.

    The form records ``page_title`` as its source. Its raw tags are the row
    header (when non-empty) followed by the text of every column header
    whose span ``[col_index, col_index + colspan)`` contains ``col_index``.
    The raw tags are passed through ``translate_raw_tags`` before the form
    is stored.
    """
    form = Form(form=form_str, source=page_title)
    if row_header != "":
        form.raw_tags.append(row_header)
    form.raw_tags.extend(
        header.text
        for header in col_headers
        if header.col_index <= col_index < header.col_index + header.colspan
    )
    translate_raw_tags(form)
    word_entry.forms.append(form)