Coverage for src/wiktextract/extractor/fr/inflection.py: 91%

133 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .pronunciation import is_ipa_text 

9from .tags import translate_raw_tags 

10 

11 

def extract_inflection(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
) -> None:
    """Extract forms from an inflection table template.

    Templates whose name starts with "en-adj" are handled by
    `process_en_adj_table`; every other template goes through
    `process_inflection_table`.

    Inflection templates are listed at
    https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    """
    is_en_adj = template_node.template_name.startswith("en-adj")
    handler = process_en_adj_table if is_en_adj else process_inflection_table
    handler(wxr, page_data, template_node)

23 

24 

# Header cell texts, matched in lower case, that are skipped and never added
# to a form's raw tags.  The trailing comment names a template using the text.
IGNORE_TABLE_HEADERS = frozenset(
    {
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "forme",  # br-flex-adj
        "temps",  # en-conj-rég,
        "cas",  # lt_décl_as, ro-nom-tab(lower case)
        "commun",  # sv-nom-c-ar
        "personne",  # hu-pos-otok
        "pronom personnel",  # it-enclise
        "mutation",  # br-nom
        "nombre",  # ca-accord-mixte2
        "nature",  # de-adj
        "genre",  # es-accord-oa
        "conjugaison présent indicatif",  # avk-tab-conjug
        "mode",  # eo-conj
        "avec suffixes possessifs",  # fi-décl-valo
    }
)
# A header cell is also skipped when its lower-cased text starts with one of
# these prefixes (the tuple is passed directly to `str.startswith`).
IGNORE_TABLE_HEADER_PREFIXES = (
    "voir la conjugaison du verbe ",  # Modèle:fr-verbe-flexion
    "conjugaison de ",  # sv-conj-ar
    "déclinaison de ",  # da-adj
)
# Data cell lines that are never stored as a form; compared exactly, without
# lower-casing.
IGNORE_TABLE_CELL = frozenset(
    {
        "Déclinaisons",  # de-adj
        "—",  # https://fr.wiktionary.org/wiki/Modèle:vls-nom
    }
)
# A data cell line is dropped as well when its lower-cased text starts with
# one of these prefixes.
IGNORE_TABLE_CELL_PREFIXES = (
    "voir conjugaison ",  # en-conj, avk-conj
)

57 

58 

@dataclass
class ColspanHeader:
    """Column header cell that carries a "colspan" attribute."""

    # cleaned header text, later appended to the raw tags of covered cells
    text: str
    # column index at which the header cell starts
    index: int
    # number of columns covered, taken from the "colspan" attribute
    span: int

64 

65 

def table_data_cell_is_header(
    wxr: WiktextractContext, cell_node: WikiNode, page_title: str
) -> bool:
    """Tell whether a plain table cell is actually used as a header.

    Only the first child yielded by `filter_empty_str_child()` is inspected.
    The cell counts as a header when that child is a bold node whose cleaned
    text starts with an upper case character and differs from `page_title`.
    Nodes that are not `TABLE_CELL`, and cells without such a child, are
    never headers.
    """
    if cell_node.kind != NodeKind.TABLE_CELL:
        return False

    no_child = object()
    first_child = next(iter(cell_node.filter_empty_str_child()), no_child)
    if first_child is no_child:
        return False

    # the text is cleaned before the type check, as plain strings are valid
    # input for clean_node() too
    first_text = clean_node(wxr, None, first_child)
    if not isinstance(first_child, WikiNode):
        return False
    if first_child.kind != NodeKind.BOLD:
        return False
    # slicing keeps the empty string safe: "".isupper() is False
    return first_text[:1].isupper() and first_text != page_title

82 

83 

def _parse_span(table_cell: WikiNode, attr_name: str) -> int:
    """Return the integer value of a "colspan"/"rowspan" cell attribute.

    A missing or non-numeric value yields 1 instead of raising, so a single
    malformed attribute cannot abort the processing of the whole table.
    """
    span_text = str(table_cell.attrs.get(attr_name, "1")).strip()
    return int(span_text) if span_text.isdecimal() else 1


def process_inflection_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_template: TemplateNode,
) -> None:
    """Extract word forms from a generic inflection table template.

    The template is expanded and parsed again, and only the first table of
    the expansion is read.  Header cells are collected as raw tags: column
    headers, "colspan" headers, and row headers (carried over to following
    rows through "rowspan").  Every kept line of a data cell becomes a
    `Form` appended to `page_data[-1].forms`; IPA lines of the cell are
    stored in the form's `ipas` instead.

    Nothing is returned; `page_data[-1]` is modified in place.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(table_template), expand_all=True
    )
    table_nodes = list(expanded_node.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    column_headers: list[str] = []
    # (header text, number of following rows the header still covers)
    rowspan_headers: list[tuple[str, int]] = []
    colspan_headers: list[ColspanHeader] = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        # keep header cells and non-empty data cells, drop hidden cells
        table_row_nodes = [
            row_node_child
            for row_node_child in table_row.children
            if isinstance(row_node_child, WikiNode)
            and (
                row_node_child.kind == NodeKind.TABLE_HEADER_CELL
                or (
                    row_node_child.kind == NodeKind.TABLE_CELL
                    and len(list(row_node_child.filter_empty_str_child())) > 0
                )
            )
            and row_node_child.attrs.get("style") != "display:none"
            and "invisible" not in row_node_child.attrs.get("class", "")
        ]
        current_row_has_data_cell = any(
            cell.kind == NodeKind.TABLE_CELL
            and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
            for cell in table_row_nodes
        )
        if not current_row_has_data_cell:
            # a header-only row replaces the previous column headers
            column_headers.clear()
        # headers of previous rows whose rowspan reaches the current row
        row_headers = [text for text, _ in rowspan_headers]
        rowspan_headers = [
            (text, remaining - 1)
            for text, remaining in rowspan_headers
            if remaining > 1
        ]

        column_cell_index = 0
        for table_cell in table_row_nodes:
            if (
                table_cell.kind == NodeKind.TABLE_HEADER_CELL
                or table_data_cell_is_header(
                    wxr, table_cell, page_data[-1].word
                )
            ):
                if any(
                    table_cell.find_html(
                        "span",
                        attr_name="class",
                        attr_value="ligne-de-forme",
                    )
                ):
                    # ignore gender header in template "ro-nom-tab"
                    continue
                table_header_text = clean_node(wxr, None, table_cell).replace(
                    "\n", " "
                )
                lower_header_text = table_header_text.lower()
                if (
                    lower_header_text in IGNORE_TABLE_HEADERS
                    or lower_header_text.startswith(
                        IGNORE_TABLE_HEADER_PREFIXES
                    )
                    or len(table_header_text.strip()) == 0
                ):
                    continue
                rsplit_header = table_header_text.rsplit(maxsplit=1)
                if len(rsplit_header) > 1 and rsplit_header[-1].isdecimal():
                    # "Pluriel 1" in template "br-nom"
                    table_header_text = rsplit_header[0]

                if not current_row_has_data_cell:
                    # if all cells of the row are header cells
                    # then the header cells are column headers
                    colspan = _parse_span(table_cell, "colspan")
                    if "colspan" in table_cell.attrs:
                        colspan_headers.append(
                            ColspanHeader(
                                table_header_text, column_cell_index, colspan
                            )
                        )
                    else:
                        column_headers.append(table_header_text)
                    column_cell_index += colspan
                else:
                    if table_header_text not in row_headers:
                        row_headers.append(table_header_text)
                    # remember the header only if it covers further rows;
                    # rowspan="1" must not leak into the next row
                    remaining_rows = _parse_span(table_cell, "rowspan") - 1
                    if remaining_rows > 0:
                        rowspan_headers.append(
                            (table_header_text, remaining_rows)
                        )
            elif table_cell.kind == NodeKind.TABLE_CELL:
                form_data = Form()
                table_cell_lines = clean_node(wxr, None, table_cell)
                for table_cell_line in table_cell_lines.splitlines():
                    if is_ipa_text(table_cell_line):
                        insert_ipa(form_data, table_cell_line)
                    elif (
                        table_cell_line != page_data[-1].word
                        and table_cell_line not in IGNORE_TABLE_CELL
                        and not table_cell_line.lower().startswith(
                            IGNORE_TABLE_CELL_PREFIXES
                        )
                    ):
                        # several forms of one cell are kept one per line
                        # and split again below
                        if form_data.form == "":
                            form_data.form = table_cell_line
                        else:
                            form_data.form += "\n" + table_cell_line
                for colspan_header in colspan_headers:
                    if (
                        column_cell_index >= colspan_header.index
                        and column_cell_index
                        < colspan_header.index + colspan_header.span
                    ):
                        form_data.raw_tags.append(colspan_header.text)
                if (
                    "colspan" not in table_cell.attrs
                    and len(column_headers) > column_cell_index
                    and column_headers[column_cell_index].lower()
                    not in IGNORE_TABLE_HEADERS
                ):
                    form_data.raw_tags.append(
                        column_headers[column_cell_index]
                    )

                if len(row_headers) > 0:
                    form_data.raw_tags.extend(row_headers)
                if form_data.form != "":
                    for form in form_data.form.splitlines():
                        if form.startswith("(") and form.endswith(")"):
                            # a parenthesized line is a tag for the forms
                            # that follow it in the same cell
                            form_data.raw_tags.append(form.strip("()"))
                            continue
                        new_form_data = form_data.model_copy(deep=True)
                        new_form_data.form = form.removeprefix("ou ")
                        translate_raw_tags(
                            new_form_data, table_template.template_name
                        )
                        if len(new_form_data.form.strip()) > 0:
                            page_data[-1].forms.append(new_form_data)

                column_cell_index += _parse_span(table_cell, "colspan")

245 

246 

def split_ipa(text: str) -> list[str]:
    """Split one line of IPA text into its individual pronunciations.

    A leading "ou " (or) is dropped and the rest is split on " ou ", since
    some templates ("en-conj-rég") put two pronunciations on the same line.
    Pieces ending with "Prononciation ?" followed by a backslash are
    discarded: inflection table templates use an edit link when the IPA data
    is missing, and the link usually ends with that text.

    Returns the remaining pronunciations, possibly an empty list.
    """
    # strip the prefix before splitting, otherwise it stays in the first
    # piece of lines that contain both a leading "ou " and an inner " ou "
    pieces = text.removeprefix("ou ").split(" ou ")
    return [piece for piece in pieces if not piece.endswith("Prononciation ?\\")]

259 

260 

def insert_ipa(form: Form, ipa_text: str) -> None:
    """Append the pronunciations found in `ipa_text` to `form.ipas`.

    The text is split with `split_ipa`; `form` is left untouched when that
    returns nothing.
    """
    if pronunciations := split_ipa(ipa_text):
        form.ipas.extend(pronunciations)

266 

267 

def process_en_adj_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: WikiNode,
) -> None:
    """Extract forms from the table of an "en-adj*" template.

    See https://fr.wiktionary.org/wiki/Modèle:en-adj and the other en-adj*
    templates: they use normal table cells for the column headers, so the
    first row is skipped.  In every following row the first child gives the
    raw tag and the second child the form and its IPA lines.  The resulting
    `Form` is appended to `page_data[-1].forms` unless its form is empty or
    equal to the word of the entry.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if not tables:
        return

    rows = iter(tables[0].find_child(NodeKind.TABLE_ROW))
    next(rows, None)  # the first row only holds headers
    for row in rows:
        cells = row.children
        if len(cells) <= 1:
            continue
        entry = Form()
        entry.raw_tags.append(clean_node(wxr, None, cells[0]))
        for line in clean_node(wxr, None, cells[1]).splitlines():
            if line in IGNORE_TABLE_CELL:
                continue
            if is_ipa_text(line):
                insert_ipa(entry, line)
            else:
                # the last non-IPA line wins
                entry.form = line
        if entry.form not in ("", page_data[-1].word):
            translate_raw_tags(entry)
            page_data[-1].forms.append(entry)

303 translate_raw_tags(form_data) 

304 page_data[-1].forms.append(form_data)