Coverage for src/wiktextract/extractor/fr/inflection.py: 90%

180 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .pronunciation import is_ipa_text 

10from .tags import translate_raw_tags 

11 

12 

def extract_inflection(
    wxr: WiktextractContext, page_data: list[WordEntry], t_node: TemplateNode
):
    """Send an inflection template to the extractor that handles it.

    Templates whose name starts with "en-adj" and the "fro-adj" template
    have dedicated extractors that receive the last entry of `page_data`;
    every other template goes through the generic table parser, which
    receives the whole `page_data` list.

    Inflection templates are listed at
    https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    """
    template_name = t_node.template_name
    if template_name.startswith("en-adj"):
        process_en_adj_table(wxr, page_data[-1], t_node)
        return
    if template_name == "fro-adj":
        extract_fro_adj_template(wxr, page_data[-1], t_node)
        return
    process_inflection_table(wxr, page_data, t_node)

24 

25 

# Header texts that are skipped instead of being stored as tags. The table
# code lower-cases a header before looking it up here. The trailing comment
# of each entry names a template whose table uses that header.
IGNORE_TABLE_HEADERS = frozenset(
    (
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "forme",  # br-flex-adj
        "temps",  # en-conj-rég,
        "cas",  # lt_décl_as, ro-nom-tab(lower case)
        "commun",  # sv-nom-c-ar
        "personne",  # hu-pos-otok
        "pronom personnel",  # it-enclise
        "mutation",  # br-nom
        "nombre",  # ca-accord-mixte2
        "nature",  # de-adj
        "genre",  # es-accord-oa
        "conjugaison présent indicatif",  # avk-tab-conjug
        "mode",  # eo-conj
        "avec suffixes possessifs",  # fi-décl-valo
        "en kurmandji",  # flex-ku-nomf
    )
)
# A lower-cased header starting with one of these prefixes is skipped too.
IGNORE_TABLE_HEADER_PREFIXES = (
    "voir la conjugaison du verbe ",  # Modèle:fr-verbe-flexion
    "conjugaison de ",  # sv-conj-ar
    "déclinaison de ",  # da-adj
)
# Data cell lines that are never stored as a form (exact match).
IGNORE_TABLE_CELL = frozenset(
    (
        "Déclinaisons",  # de-adj
        "—",  # https://fr.wiktionary.org/wiki/Modèle:vls-nom
    )
)
# A lower-cased data cell line starting with one of these prefixes is not
# stored as a form either.
IGNORE_TABLE_CELL_PREFIXES = (
    "voir conjugaison ",  # en-conj, avk-conj
)

59 

60 

@dataclass
class TableHeader:
    """Text of a table header together with the cells it covers."""

    # cleaned text of the header cell
    text: str
    # position of the first covered cell: a column index for column
    # headers, a row index for row headers
    index: int
    # number of columns or rows covered, taken from "colspan"/"rowspan"
    span: int

66 

67 

def table_data_cell_is_header(
    wxr: WiktextractContext, cell_node: WikiNode, page_title: str
) -> bool:
    """Tell whether a table data cell is actually used as a header.

    Only the first child yielded by `filter_empty_str_child()` is
    inspected. The cell counts as a header when that child is a bold node
    whose cleaned text is not empty, starts with an upper case character
    and differs from `page_title`. Anything that is not a `TABLE_CELL`
    node, or a cell without such a child, is not a header.
    """
    if cell_node.kind != NodeKind.TABLE_CELL:
        return False

    no_child = object()
    first_child = next(iter(cell_node.filter_empty_str_child()), no_child)
    if first_child is no_child:
        return False

    first_text = clean_node(wxr, None, first_child)
    if not isinstance(first_child, WikiNode):
        return False
    if first_child.kind != NodeKind.BOLD:
        return False
    if len(first_text) == 0:
        return False
    return first_text[0].isupper() and first_text != page_title

84 

85 

def _parse_table_span(value, default: int = 1) -> int:
    """Return a "colspan"/"rowspan" attribute value as an int.

    `default` is returned when the attribute is missing or is not a plain
    decimal number, so malformed wikitext can't raise `ValueError`.
    """
    text = "" if value is None else str(value).strip()
    return int(text) if text.isdecimal() else default


def process_inflection_table(
    wxr: WiktextractContext, page_data: list[WordEntry], t_node: TemplateNode
) -> None:
    """Extract forms from the first table produced by an inflection template.

    The template is converted back to wikitext and parsed again with all
    templates expanded; only the first table of the result is read.

    - A row without data cells provides column headers. Header cells with
      a "colspan" attribute are remembered with the column range they
      cover.
    - Header cells in a row that has data cells are row headers. A row
      header with a "rowspan" attribute is also applied to the following
      rows it covers.
    - Every data cell is split into lines: IPA lines are stored as
      pronunciations, the other lines become forms. One `Form` per form
      line is appended to `page_data[-1].forms`, tagged with the headers
      that apply to the cell.
    - A data cell containing a conjugation link is handed to
      `process_conj_link_node()` and produces no form.
    """
    from .form_line import is_conj_link, process_conj_link_node

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_node.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    column_headers = []
    # (header text, number of following rows still covered)
    rowspan_headers = []
    colspan_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        # filter empty table cells and cells hidden with style or class
        table_row_nodes = [
            row_node_child
            for row_node_child in table_row.children
            if isinstance(row_node_child, WikiNode)
            and (
                row_node_child.kind == NodeKind.TABLE_HEADER_CELL
                or (
                    row_node_child.kind == NodeKind.TABLE_CELL
                    and len(list(row_node_child.filter_empty_str_child())) > 0
                )
            )
            and row_node_child.attrs.get("style") != "display:none"
            and "invisible" not in row_node_child.attrs.get("class", "")
        ]
        current_row_has_data_cell = any(
            isinstance(cell, WikiNode)
            and cell.kind == NodeKind.TABLE_CELL
            and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
            for cell in table_row_nodes
        )
        if not current_row_has_data_cell:
            # a new header-only row replaces the previous column headers
            column_headers.clear()

        # row headers inherited from previous rows through "rowspan"
        row_headers = []
        new_rowspan_headers = []
        for rowspan_text, rowspan_count in rowspan_headers:
            row_headers.append(rowspan_text)
            if rowspan_count - 1 > 0:
                new_rowspan_headers.append((rowspan_text, rowspan_count - 1))
        rowspan_headers = new_rowspan_headers

        column_cell_index = 0
        for table_cell in table_row_nodes:
            form_data = Form()
            if isinstance(table_cell, WikiNode):
                if (
                    table_cell.kind == NodeKind.TABLE_HEADER_CELL
                    or table_data_cell_is_header(
                        wxr, table_cell, page_data[-1].word
                    )
                ):
                    if any(
                        table_cell.find_html(
                            "span",
                            attr_name="class",
                            attr_value="ligne-de-forme",
                        )
                    ):
                        # ignore gender header in template "ro-nom-tab"
                        continue
                    table_header_text = clean_node(
                        wxr, None, table_cell
                    ).replace("\n", " ")
                    if (
                        table_header_text.lower() in IGNORE_TABLE_HEADERS
                        or table_header_text.lower().startswith(
                            IGNORE_TABLE_HEADER_PREFIXES
                        )
                        or len(table_header_text.strip()) == 0
                    ):
                        continue
                    rsplit_header = table_header_text.rsplit(maxsplit=1)
                    if len(rsplit_header) > 1 and rsplit_header[-1].isdecimal():
                        # "Pluriel 1" in template "br-nom"
                        table_header_text = rsplit_header[0]

                    if not current_row_has_data_cell:
                        # if all cells of the row are header cells
                        # then the header cells are column headers
                        colspan = _parse_table_span(
                            table_cell.attrs.get("colspan")
                        )
                        if "colspan" in table_cell.attrs:
                            colspan_headers.append(
                                TableHeader(
                                    table_header_text,
                                    column_cell_index,
                                    colspan,
                                )
                            )
                        else:
                            column_headers.append(table_header_text)
                        column_cell_index += colspan
                    else:
                        if table_header_text not in row_headers:
                            row_headers.append(table_header_text)
                        if "rowspan" in table_cell.attrs:
                            # rows below the current one that are still
                            # covered by this header; nothing is queued
                            # when the header only covers its own row
                            remaining_rows = (
                                _parse_table_span(
                                    table_cell.attrs.get("rowspan")
                                )
                                - 1
                            )
                            if remaining_rows > 0:
                                rowspan_headers.append(
                                    (table_header_text, remaining_rows)
                                )
                elif table_cell.kind == NodeKind.TABLE_CELL:
                    has_conj_link = False
                    for link_node in table_cell.find_child(NodeKind.LINK):
                        if is_conj_link(wxr, link_node):
                            process_conj_link_node(wxr, link_node, page_data)
                            has_conj_link = True
                            break
                    if has_conj_link:
                        continue
                    table_cell_lines = clean_node(wxr, None, table_cell)
                    for table_cell_line in table_cell_lines.splitlines():
                        if is_ipa_text(table_cell_line):
                            insert_ipa(form_data, table_cell_line)
                        elif (
                            table_cell_line != page_data[-1].word
                            and table_cell_line not in IGNORE_TABLE_CELL
                            and not table_cell_line.lower().startswith(
                                IGNORE_TABLE_CELL_PREFIXES
                            )
                        ):
                            # several form lines are kept in one string
                            # and split again below
                            if form_data.form == "":
                                form_data.form = table_cell_line
                            else:
                                form_data.form += "\n" + table_cell_line
                    for colspan_header in colspan_headers:
                        if (
                            column_cell_index >= colspan_header.index
                            and column_cell_index
                            < colspan_header.index + colspan_header.span
                        ):
                            form_data.raw_tags.append(colspan_header.text)
                    if (
                        "colspan" not in table_cell.attrs
                        and len(column_headers) > column_cell_index
                        and column_headers[column_cell_index].lower()
                        not in IGNORE_TABLE_HEADERS
                    ):
                        form_data.raw_tags.append(
                            column_headers[column_cell_index]
                        )

                    if len(row_headers) > 0:
                        form_data.raw_tags.extend(row_headers)
                    if form_data.form != "":
                        for form in form_data.form.splitlines():
                            if form.startswith("(") and form.endswith(")"):
                                # a line in parentheses is a tag for the
                                # following lines, not a form
                                form_data.raw_tags.append(form.strip("()"))
                                continue
                            new_form_data = form_data.model_copy(deep=True)
                            new_form_data.form = form.removeprefix("ou ")
                            translate_raw_tags(
                                new_form_data, t_node.template_name
                            )
                            if len(new_form_data.form.strip()) > 0:
                                page_data[-1].forms.append(new_form_data)

                    # a cell with a missing or malformed "colspan"
                    # occupies one column
                    column_cell_index += _parse_table_span(
                        table_cell.attrs.get("colspan")
                    )

255 

256 

def split_ipa(text: str) -> list[str]:
    """Split a line of IPA text into separate pronunciations.

    - Pronunciations joined with " ou " (or) are returned separately; two
      IPA texts on one line occur in the "en-conj-rég" template.
    - A leading "ou " is removed from each pronunciation.
    - Inflection table templates use an edit link when the IPA data is
      missing, and the link usually ends with "Prononciation ?"; such
      placeholders are dropped.

    The three rules are applied to every part of the line, so they also
    work when combined, e.g. "ou \\a\\ ou \\b\\".

    Returns the list of pronunciations, which is empty when the line only
    contains the missing pronunciation placeholder.
    """
    ipas = []
    for part in text.split(" ou "):
        part = part.removeprefix("ou ")
        if part.endswith("Prononciation ?\\"):
            continue
        ipas.append(part)
    return ipas

269 

270 

def insert_ipa(form: Form, ipa_text: str) -> None:
    """Append the pronunciations found in `ipa_text` to `form.ipas`.

    The text is split with `split_ipa()`; `form` is left untouched when
    that returns no pronunciation.
    """
    pronunciations = split_ipa(ipa_text)
    if pronunciations:
        form.ipas.extend(pronunciations)

276 

277 

def process_en_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: WikiNode
) -> None:
    """Extract forms from the table of an "en-adj*" template.

    See https://fr.wiktionary.org/wiki/Modèle:en-adj and the other
    "en-adj*" templates. These templates use normal table cells for the
    column headers, so the first row is skipped. In each following row
    the first child provides the raw tag and the second child the form
    and/or its IPA lines. A form is appended to `word_entry.forms` when
    it is not empty and differs from the entry word.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if not tables:
        return

    rows = tables[0].find_child(NodeKind.TABLE_ROW)
    for row_index, row in enumerate(rows):
        # the first row is the header row, the other rows need a tag
        # child and a form child
        if row_index == 0 or len(row.children) <= 1:
            continue
        tag_child, form_child = row.children[0], row.children[1]
        entry = Form()
        entry.raw_tags.append(clean_node(wxr, None, tag_child))
        for line in clean_node(wxr, None, form_child).splitlines():
            if line in IGNORE_TABLE_CELL:
                continue
            if is_ipa_text(line):
                insert_ipa(entry, line)
            else:
                # the last non-IPA line wins
                entry.form = line
        if len(entry.form) > 0 and entry.form != word_entry.word:
            translate_raw_tags(entry)
            word_entry.forms.append(entry)

313 

314 

def extract_fro_adj_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms from the table of the "fro-adj" template.

    See https://fr.wiktionary.org/wiki/Modèle:fro-adj

    Each table of the expanded template is read in two passes: the first
    pass collects the headers, the second pass turns every data cell into
    a `Form` appended to `word_entry.forms`. Header cells in a row without
    data cells are column headers, matched to data cells by position;
    header cells in a row with data cells are row headers, matched through
    the rows they cover. Empty cells and cells equal to the page title
    are skipped.
    """

    def read_rowspan(cell) -> int:
        # a missing or non-numeric "rowspan" attribute counts as one row
        raw_value = cell.attrs.get("rowspan", "1")
        if re.fullmatch(r"\d+", raw_value) is None:
            return 1
        return int(raw_value)

    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    col_headers = []
    row_headers = []
    for table in root.find_child(NodeKind.TABLE):
        # first pass: headers
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header_cell)
                if header_text == "":
                    continue
                if header_text.lower() in IGNORE_TABLE_HEADERS:
                    continue
                if row_has_data:
                    row_headers.append(
                        TableHeader(
                            header_text, row_index, read_rowspan(header_cell)
                        )
                    )
                else:
                    col_headers.append(header_text)

        # second pass: data cells
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            for col_index, data_cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, data_cell)
                if cell_text == "" or cell_text == wxr.wtp.title:
                    continue
                form = Form(form=cell_text)
                if col_index < len(col_headers):
                    form.raw_tags.append(col_headers[col_index])
                # first row after the rows covered by this data cell
                end_row = row_index + read_rowspan(data_cell)
                for header in row_headers:
                    overlaps = (
                        header.index < end_row
                        and row_index < header.index + header.span
                    )
                    if overlaps and header.text not in form.raw_tags:
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                word_entry.forms.append(form)