Coverage for src/wiktextract/extractor/pt/inflection.py: 93%

1import re

2from dataclasses import dataclass

4from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

6from ...page import clean_node

7from ...wxr_context import WiktextractContext

8from .models import Form, WordEntry

9from .tags import translate_raw_tags

@dataclass
class TableHeader:
    """Header cell of an inflection table plus the grid area it covers."""

    # Cleaned header text; appended to a form's raw tags when it applies.
    text: str
    # First column covered by the header.
    col_index: int
    # Number of columns covered, counted from ``col_index``.
    colspan: int
    # First row covered by the header.
    row_index: int
    # Number of rows covered, counted from ``row_index``.
    rowspan: int

def extract_flex_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect inflected forms from the tables of an expanded flex template.

    The template is converted back to wikitext, expanded and re-parsed.
    Each link found in a data cell is appended to ``word_entry.forms`` as a
    ``Form``, tagged with the current row header and with every column
    header whose column span contains the data cell's column.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:flex.pt
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for table in root.find_child(NodeKind.TABLE):
        # Column headers accumulate over all header-only rows of the table.
        headers: list[TableHeader] = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_label = ""
            data_col = 0  # column position among data cells of this row
            header_col = 0  # column position among header cells of this row
            for cell in row.find_child(cell_kinds):
                span_text = cell.attrs.get("colspan", "1")
                span = int(span_text) if re.fullmatch(r"\d+", span_text) else 1
                text = clean_node(wxr, None, cell)
                if text == "":
                    # Empty cells are skipped without advancing any counter.
                    continue

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if row.contain_node(NodeKind.TABLE_CELL):
                        # Header cell in a row that also has data cells.
                        row_label = text
                    else:
                        headers.append(
                            TableHeader(text, header_col, span, 0, 0)
                        )
                        header_col += span
                    continue

                if cell.attrs.get("style") == "background:#f4f4f4;":
                    # Data cell carrying this exact style acts as row header.
                    row_label = text
                    header_col += span
                    continue

                for link in cell.find_child(NodeKind.LINK):
                    word = clean_node(wxr, None, link)
                    if word in ("", "–", "-", wxr.wtp.title):
                        continue
                    form = Form(form=word)
                    if row_label != "":
                        form.raw_tags.append(row_label)
                    form.raw_tags.extend(
                        head.text
                        for head in headers
                        if head.col_index
                        <= data_col
                        < head.col_index + head.colspan
                    )
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

                data_col += span

def extract_conjugation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Route each conjugation template under ``level_node`` to its extractor.

    Templates whose name starts with ``conj.pt`` or ``conj/pt`` go to
    ``extract_conj_pt_template``; names starting with ``conj.en`` go to
    ``extract_conj_en_template``. Any other template is ignored.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        if name.startswith(("conj.pt", "conj/pt")):
            extract_conj_pt_template(wxr, word_entry, template)
            continue
        if name.startswith("conj.en"):
            extract_conj_en_template(wxr, word_entry, template)

def extract_conj_pt_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the first two tables of a conj.pt style template.

    The template is expanded and re-parsed; tables are searched
    recursively. The first table found is handled by
    ``extract_conj_pt_template_first_table``, the second by
    ``extract_conj_pt_template_second_table``; further tables are ignored.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:conj.pt
    # https://pt.wiktionary.org/wiki/Predefinição:conj/pt
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    handlers = (
        extract_conj_pt_template_first_table,
        extract_conj_pt_template_second_table,
    )
    # zip() stops once the handlers run out, so only two tables are used.
    tables = root.find_child_recursively(NodeKind.TABLE)
    for handler, table_node in zip(handlers, tables):
        handler(wxr, word_entry, table_node)

108

109

def extract_conj_pt_template_first_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract forms from a table whose rows pair a header with data cells.

    Within a row, the most recent header cell text is used as the raw tag
    for each following data cell. Data cells whose text is empty or equal
    to the page title are skipped.
    """
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        label = ""
        for cell_node in row_node.find_child(cell_kinds):
            if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                label = clean_node(wxr, None, cell_node)
            elif cell_node.kind == NodeKind.TABLE_CELL:
                text = clean_node(wxr, None, cell_node)
                if text in ("", wxr.wtp.title):
                    continue
                entry = Form(form=text)
                if label != "":
                    entry.raw_tags.append(label)
                translate_raw_tags(entry)
                word_entry.forms.append(entry)

129

130

def extract_conj_pt_template_second_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract forms from a table that has both column and row headers.

    Header cells in rows without data cells are recorded as column headers;
    header cells in rows with data cells are recorded as row headers. Each
    data cell yields one form per link, or one form from the whole cell text
    when it has no link, and is passed to ``add_conj_pt_form`` together with
    its column/row position and the headers collected so far.
    """

    def span_of(cell_node: WikiNode, attr_name: str) -> int:
        # Only a purely numeric attribute value is used; otherwise span is 1.
        raw = cell_node.attrs.get(attr_name, "1")
        return int(raw) if re.fullmatch(r"\d+", raw) else 1

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    column_heads: list[TableHeader] = []
    row_heads: list[TableHeader] = []
    y = 0
    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        x = 0
        for cell_node in row_node.find_child(cell_kinds):
            if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                width = span_of(cell_node, "colspan")
                height = span_of(cell_node, "rowspan")
                label = clean_node(wxr, None, cell_node)
                if label == "":
                    continue
                if height > 1:
                    # A multi-row header restarts row counting and drops
                    # the row headers gathered before it.
                    y = 0
                    row_heads.clear()
                head = TableHeader(label, x, width, y, height)
                if row_node.contain_node(NodeKind.TABLE_CELL):
                    row_heads.append(head)
                else:
                    column_heads.append(head)
                    x += width
            elif cell_node.kind == NodeKind.TABLE_CELL:
                linked = False
                for link in cell_node.find_child(NodeKind.LINK):
                    # Any link, even a skipped one, suppresses the
                    # whole-cell fallback below.
                    linked = True
                    text = clean_node(wxr, None, link)
                    if text not in ("", wxr.wtp.title):
                        add_conj_pt_form(
                            word_entry, text, x, y, column_heads, row_heads
                        )
                if not linked:
                    text = clean_node(wxr, None, cell_node)
                    if text not in ("", wxr.wtp.title):
                        add_conj_pt_form(
                            word_entry, text, x, y, column_heads, row_heads
                        )
                # Data cells always advance the column by exactly one.
                x += 1

        y += 1

194

195

def add_conj_pt_form(
    word_entry: WordEntry,
    form_str: str,
    col_index: int,
    row_index: int,
    col_headers: list[TableHeader],
    row_headers: list[TableHeader],
) -> None:
    """Append ``form_str`` to ``word_entry.forms`` with its header tags.

    Raw tags are the texts of the column headers whose column span contains
    ``col_index``, followed by the texts of the row headers whose row span
    contains ``row_index``. ``translate_raw_tags`` is applied before the
    form is appended.
    """
    entry = Form(form=form_str)
    entry.raw_tags.extend(
        head.text
        for head in col_headers
        if head.col_index <= col_index < head.col_index + head.colspan
    )
    entry.raw_tags.extend(
        head.text
        for head in row_headers
        if head.row_index <= row_index < head.row_index + head.rowspan
    )
    translate_raw_tags(entry)
    word_entry.forms.append(entry)

219

220

def extract_conj_en_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the table cells of an expanded conj.en template.

    For each data cell, the text of its last ``<sup>`` element (with
    surrounding ``:`` and spaces stripped) is used as the raw tag. Every
    bold node directly inside a list item of the cell becomes a form,
    unless its text is empty or equals the page title.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:conj.en

    def bold_nodes(cell_node):
        # Bold nodes found directly in the list items of the cell's lists.
        for list_node in cell_node.find_child(NodeKind.LIST):
            for item in list_node.find_child(NodeKind.LIST_ITEM):
                yield from item.find_child(NodeKind.BOLD)

    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for table_node in root.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                label = ""
                for sup_node in cell_node.find_html("sup"):
                    # Later <sup> elements overwrite earlier ones.
                    label = clean_node(wxr, None, sup_node.children)
                    label = label.strip(": ")
                for bold in bold_nodes(cell_node):
                    text = clean_node(wxr, None, bold)
                    if text in ("", wxr.wtp.title):
                        continue
                    entry = Form(form=text)
                    if label != "":
                        entry.raw_tags.append(label)
                    translate_raw_tags(entry)
                    word_entry.forms.append(entry)

246

247

def extract_degree_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract forms from the list items of a degree section.

    In each list item, the first bold node supplies the label and the
    children after it supply a comma-separated list of forms. Every
    non-empty form that differs from the page title is appended to
    ``word_entry.forms``, tagged with the label when the label is not empty.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            # The second argument makes find_child yield (index, node)
            # pairs, so the children following the bold node can be sliced.
            for index, bold_node in list_item.find_child(NodeKind.BOLD, True):
                bold_str = clean_node(wxr, None, bold_node)
                forms_str = clean_node(
                    wxr, None, list_item.children[index + 1 :]
                ).strip(": ")
                for form_str in forms_str.split(","):
                    form_str = form_str.strip()
                    if form_str in ("", wxr.wtp.title):
                        continue
                    form = Form(form=form_str)
                    # Test the label, not the form: form_str is already
                    # known to be non-empty here, and an empty label must
                    # not be stored as a blank raw tag.
                    if bold_str != "":
                        form.raw_tags.append(bold_str)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                # Only the first bold node of a list item is used.
                break