Coverage for src/wiktextract/extractor/pt/inflection.py: 93%

164 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

@dataclass
class TableHeader:
    """Text, position and extent of one header cell in an inflection table.

    The column fields are compared with a data cell's column index and the
    row fields with its row index to decide whether ``text`` applies to
    that cell.
    """

    # Cleaned text of the header cell; used as a raw tag on matching forms.
    text: str
    # Index of the first column the header covers.
    col_index: int
    # Number of columns the header covers, starting at ``col_index``.
    colspan: int
    # Index of the first row the header covers.
    row_index: int
    # Number of rows the header covers, starting at ``row_index``.
    rowspan: int

19 

20 

def extract_flex_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Append the forms listed in an expanded ``flex.pt`` table template.

    The template is expanded and re-parsed, and each table that is a direct
    child of the result is walked row by row.  Header cells provide raw
    tags; every link inside an ordinary cell becomes a ``Form`` that is
    appended to ``word_entry.forms``.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:flex.pt
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for table in root.find_child(NodeKind.TABLE):
        # Column headers are kept for the whole table, not reset per row.
        column_heads = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_label = ""
            data_col = 0
            header_col = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                span_attr = cell.attrs.get("colspan", "1")
                span = int(span_attr) if re.fullmatch(r"\d+", span_attr) else 1
                text = clean_node(wxr, None, cell)
                if text == "":
                    # Empty cells are ignored and advance no column counter.
                    continue

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if row.contain_node(NodeKind.TABLE_CELL):
                        # A header cell next to data cells labels the row.
                        row_label = text
                    else:
                        column_heads.append(
                            TableHeader(text, header_col, span, 0, 0)
                        )
                        header_col += span
                    continue

                if cell.attrs.get("style") == "background:#f4f4f4;":
                    # An ordinary cell with this exact style is also used
                    # as the row label.
                    row_label = text
                    header_col += span
                    continue

                for link in cell.find_child(NodeKind.LINK):
                    form_text = clean_node(wxr, None, link)
                    if form_text in ("", "–", "-", wxr.wtp.title):
                        continue
                    form = Form(form=form_text)
                    if row_label != "":
                        form.raw_tags.append(row_label)
                    form.raw_tags.extend(
                        head.text
                        for head in column_heads
                        if head.col_index
                        <= data_col
                        < head.col_index + head.colspan
                    )
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

                # Only data cells move the data column counter.
                data_col += span

76 

77 

def extract_conjugation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Dispatch each conjugation template of the section to its extractor.

    Templates whose name starts with ``conj.pt`` or ``conj/pt`` go to
    ``extract_conj_pt_template``; names starting with ``conj.en`` go to
    ``extract_conj_en_template``.  Any other template is ignored.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        if name.startswith(("conj.pt", "conj/pt")):
            extract_conj_pt_template(wxr, word_entry, template)
            continue
        if name.startswith("conj.en"):
            extract_conj_en_template(wxr, word_entry, template)

86 

87 

def extract_conj_pt_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the tables of an expanded ``conj.pt`` template.

    Tables are searched recursively in the expanded template.  The first
    one found is handled by ``extract_conj_pt_template_first_table`` and
    the second by ``extract_conj_pt_template_second_table``; any further
    tables are ignored.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:conj.pt
    # https://pt.wiktionary.org/wiki/Predefinição:conj/pt
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    tables = root.find_child_recursively(NodeKind.TABLE)
    for position, table in enumerate(tables):
        if position == 0:
            extract_conj_pt_template_first_table(wxr, word_entry, table)
        elif position == 1:
            extract_conj_pt_template_second_table(wxr, word_entry, table)

108 

109 

def extract_conj_pt_template_first_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract forms from the first ``conj.pt`` table.

    Within a row, the text of the most recent header cell is used as the
    raw tag for the data cells that follow it.  Data cells whose text is
    empty or equal to the page title produce no form.
    """
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        label = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                label = clean_node(wxr, None, cell)
            elif cell.kind == NodeKind.TABLE_CELL:
                text = clean_node(wxr, None, cell)
                if text in ("", wxr.wtp.title):
                    continue
                entry = Form(form=text)
                if label != "":
                    entry.raw_tags.append(label)
                translate_raw_tags(entry)
                word_entry.forms.append(entry)

129 

130 

def extract_conj_pt_template_second_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract forms from the second ``conj.pt`` table.

    Header cells are recorded as ``TableHeader`` entries: as column headers
    when their row has no data cell, otherwise as row headers.  Each data
    cell yields one form per link it contains, or one form from the whole
    cell text when it contains no link; ``add_conj_pt_form`` attaches the
    headers that cover the cell's column and row.
    """
    column_heads = []
    row_heads = []
    current_row = 0
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        current_col = 0
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                # Span attributes that are not plain digits count as 1.
                col_attr = cell.attrs.get("colspan", "1")
                width = int(col_attr) if re.fullmatch(r"\d+", col_attr) else 1
                row_attr = cell.attrs.get("rowspan", "1")
                height = int(row_attr) if re.fullmatch(r"\d+", row_attr) else 1
                label = clean_node(wxr, None, cell)
                if label == "":
                    continue
                if height > 1:
                    # A multi-row header restarts row counting and drops
                    # the row headers collected so far.
                    current_row = 0
                    row_heads.clear()
                head = TableHeader(
                    label, current_col, width, current_row, height
                )
                if row.contain_node(NodeKind.TABLE_CELL):
                    row_heads.append(head)
                else:
                    column_heads.append(head)
                    current_col += width
            elif cell.kind == NodeKind.TABLE_CELL:
                # Use each link as a form source; fall back to the whole
                # cell only when it has no link node at all.
                link_nodes = list(cell.find_child(NodeKind.LINK))
                sources = link_nodes if link_nodes else [cell]
                for source in sources:
                    text = clean_node(wxr, None, source)
                    if text not in ("", wxr.wtp.title):
                        add_conj_pt_form(
                            word_entry,
                            text,
                            current_col,
                            current_row,
                            column_heads,
                            row_heads,
                        )
                # Data cells always advance by one column.
                current_col += 1

        current_row += 1

194 

195 

def add_conj_pt_form(
    word_entry: WordEntry,
    form_str: str,
    col_index: int,
    row_index: int,
    col_headers: list[TableHeader],
    row_headers: list[TableHeader],
) -> None:
    """Create a ``Form`` for ``form_str`` and append it to ``word_entry``.

    The form's raw tags are the texts of every column header whose column
    range contains ``col_index``, followed by the texts of every row header
    whose row range contains ``row_index``.  ``translate_raw_tags`` is then
    applied before the form is stored.
    """
    entry = Form(form=form_str)
    entry.raw_tags.extend(
        head.text
        for head in col_headers
        if head.col_index <= col_index < head.col_index + head.colspan
    )
    entry.raw_tags.extend(
        head.text
        for head in row_headers
        if head.row_index <= row_index < head.row_index + head.rowspan
    )
    translate_raw_tags(entry)
    word_entry.forms.append(entry)

219 

220 

def extract_conj_en_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from an expanded ``conj.en`` template.

    In each data cell, the text of the last ``<sup>`` element (with
    surrounding ``:`` and spaces stripped) is the raw tag, and every bold
    node that is a direct child of a list item in the cell is a form.
    Forms that are empty or equal to the page title are skipped.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:conj.en
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for table in root.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(NodeKind.TABLE_CELL):
                label = ""
                for sup in cell.find_html("sup"):
                    stripped = clean_node(wxr, None, sup.children)
                    label = stripped.strip(": ")
                for listing in cell.find_child(NodeKind.LIST):
                    for item in listing.find_child(NodeKind.LIST_ITEM):
                        for bold in item.find_child(NodeKind.BOLD):
                            text = clean_node(wxr, None, bold)
                            if text in ("", wxr.wtp.title):
                                continue
                            entry = Form(form=text)
                            if label != "":
                                entry.raw_tags.append(label)
                            translate_raw_tags(entry)
                            word_entry.forms.append(entry)

246 

247 

def extract_degree_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract degree forms from the list items of a section.

    For each list item, the first bold node that is a direct child is the
    label.  Everything after it in the item is cleaned, stripped of
    surrounding ``:`` and spaces, and split on commas; each non-empty part
    that differs from the page title becomes a ``Form`` appended to
    ``word_entry.forms``, tagged with the label when the label is not empty.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            # The second argument makes find_child yield (index, node)
            # pairs; the index locates the text that follows the label.
            for index, bold_node in list_item.find_child(NodeKind.BOLD, True):
                bold_str = clean_node(wxr, None, bold_node)
                forms_str = clean_node(
                    wxr, None, list_item.children[index + 1 :]
                ).strip(": ")
                for form_str in forms_str.split(","):
                    form_str = form_str.strip()
                    if form_str in ("", wxr.wtp.title):
                        continue
                    form = Form(form=form_str)
                    # Guard on the label itself: the previous check tested
                    # form_str, which is always non-empty here, so an
                    # empty label was appended as an empty raw tag.
                    if bold_str != "":
                        form.raw_tags.append(bold_str)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                # Only the first bold node of a list item is used.
                break