Coverage for src/wiktextract/extractor/fr/inflection.py: 89%

142 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .pronunciation import is_ipa_text 

9from .tags import translate_raw_tags 

10 

11 

def extract_inflection(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
) -> None:
    """Dispatch an inflection template to the matching table processor.

    Templates whose name starts with ``"en-adj"`` are handled by
    ``process_en_adj_table``; every other template goes through
    ``process_inflection_table``. Both receive the same three arguments.

    Inflection templates are listed at
    https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    """
    table_processor = (
        process_en_adj_table
        if template_node.template_name.startswith("en-adj")
        else process_inflection_table
    )
    table_processor(wxr, page_data, template_node)

23 

24 

# Header texts that are skipped when reading inflection tables. The lookup
# is done with the lower-cased header text, so every entry is lower case.
# The trailing comment names a template in which the header occurs.
IGNORE_TABLE_HEADERS = frozenset(
    (
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "forme",  # br-flex-adj
        "temps",  # en-conj-rég,
        "cas",  # lt_décl_as, ro-nom-tab(lower case)
        "commun",  # sv-nom-c-ar
        "personne",  # hu-pos-otok
        "pronom personnel",  # it-enclise
        "mutation",  # br-nom
        "nombre",  # ca-accord-mixte2
        "nature",  # de-adj
        "genre",  # es-accord-oa
        "conjugaison présent indicatif",  # avk-tab-conjug
        "mode",  # eo-conj
        "avec suffixes possessifs",  # fi-décl-valo
        "en kurmandji",  # flex-ku-nomf
    )
)

# Lower-case prefixes; a header whose lower-cased text starts with one of
# them is skipped. Kept as a tuple so it can be passed to str.startswith().
IGNORE_TABLE_HEADER_PREFIXES = (
    "voir la conjugaison du verbe ",  # Modèle:fr-verbe-flexion
    "conjugaison de ",  # sv-conj-ar
    "déclinaison de ",  # da-adj
)

# Data cell lines that are never stored as a form (compared as-is, without
# lower-casing).
IGNORE_TABLE_CELL = frozenset(
    (
        "Déclinaisons",  # de-adj
        "—",  # https://fr.wiktionary.org/wiki/Modèle:vls-nom
    )
)

# Lower-case prefixes of data cell lines that are never stored as a form.
IGNORE_TABLE_CELL_PREFIXES = (
    "voir conjugaison ",  # en-conj, avk-conj
)

58 

59 

@dataclass
class ColspanHeader:
    """A column header cell that carries a ``colspan`` attribute.

    ``process_inflection_table`` records one of these per spanning header
    and later tags every data cell whose column index lies in
    ``[index, index + span)`` with ``text``.
    """

    # cleaned header text, used as a raw tag
    text: str
    # column index at which the header cell starts
    index: int
    # number of columns the header covers (its colspan value)
    span: int

65 

66 

def table_data_cell_is_header(
    wxr: WiktextractContext, cell_node: WikiNode, page_title: str
) -> bool:
    """Tell whether a plain table cell is really acting as a header.

    Only the first non-empty child of ``cell_node`` is inspected. The cell
    counts as a header when that child is a bold node whose cleaned text is
    non-empty, starts with an upper-case character and is different from
    ``page_title``. Nodes that are not ``TABLE_CELL`` and cells without any
    non-empty child yield ``False``.
    """
    if cell_node.kind != NodeKind.TABLE_CELL:
        return False

    for first_child in cell_node.filter_empty_str_child():
        # The verdict depends on the first child only, so every path below
        # returns during the first iteration.
        text = clean_node(wxr, None, first_child)
        if not isinstance(first_child, WikiNode):
            return False
        if first_child.kind != NodeKind.BOLD:
            return False
        if len(text) == 0 or not text[0].isupper():
            return False
        return text != page_title

    return False

83 

84 

def _parse_span(raw_value: object, default: int = 1) -> int:
    """Leniently parse a ``colspan``/``rowspan`` attribute value.

    Returns ``default`` when the value is missing, is not an integer, or is
    negative, so malformed table markup cannot raise or move the column
    index backwards.
    """
    try:
        span = int(raw_value)  # type: ignore[call-overload]
    except (TypeError, ValueError):
        return default
    return span if span >= 0 else default


def process_inflection_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_template: TemplateNode,
) -> None:
    """Extract forms from the first table produced by an inflection template.

    The template is expanded and re-parsed, then the first ``TABLE`` node is
    walked row by row:

    * a row without any real data cell is a header row: its header cells
      become column headers (or ``ColspanHeader`` entries when they carry a
      ``colspan`` attribute);
    * in a row with data cells, header cells become row headers, and a
      ``rowspan`` attribute carries the header over to the following rows;
    * each data cell yields one ``Form`` per text line, tagged with the
      matching colspan/column/row headers; IPA lines are stored as
      pronunciations of the form instead.

    New forms are appended to ``page_data[-1].forms``. Nothing is returned;
    if the expanded template contains no table the function does nothing.
    """
    from .form_line import is_conj_link, process_conj_link_node

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(table_template), expand_all=True
    )
    table_nodes = list(expanded_node.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    column_headers: list[str] = []
    # (header text, number of following rows the header still applies to)
    rowspan_headers: list[tuple[str, int]] = []
    colspan_headers: list[ColspanHeader] = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        # filter empty table cells and cells hidden through style/class
        table_row_nodes = [
            row_node_child
            for row_node_child in table_row.children
            if isinstance(row_node_child, WikiNode)
            and (
                row_node_child.kind == NodeKind.TABLE_HEADER_CELL
                or (
                    row_node_child.kind == NodeKind.TABLE_CELL
                    and len(list(row_node_child.filter_empty_str_child())) > 0
                )
            )
            and row_node_child.attrs.get("style") != "display:none"
            and "invisible" not in row_node_child.attrs.get("class", "")
        ]
        current_row_has_data_cell = any(
            isinstance(cell, WikiNode)
            and cell.kind == NodeKind.TABLE_CELL
            and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
            for cell in table_row_nodes
        )
        if not current_row_has_data_cell:
            # a new header row replaces the previous column headers
            column_headers.clear()
        row_headers = []
        # apply headers spanning from previous rows and decrement their
        # remaining row count
        new_rowspan_headers = []
        for rowspan_text, rowspan_count in rowspan_headers:
            row_headers.append(rowspan_text)
            if rowspan_count - 1 > 0:
                new_rowspan_headers.append((rowspan_text, rowspan_count - 1))
        rowspan_headers = new_rowspan_headers

        column_cell_index = 0
        for table_cell in table_row_nodes:
            form_data = Form()
            if isinstance(table_cell, WikiNode):
                if (
                    table_cell.kind == NodeKind.TABLE_HEADER_CELL
                    or table_data_cell_is_header(
                        wxr, table_cell, page_data[-1].word
                    )
                ):
                    if any(
                        table_cell.find_html(
                            "span",
                            attr_name="class",
                            attr_value="ligne-de-forme",
                        )
                    ):
                        # ignore gender header in template "ro-nom-tab"
                        continue
                    table_header_text = clean_node(
                        wxr, None, table_cell
                    ).replace("\n", " ")
                    if (
                        table_header_text.lower() in IGNORE_TABLE_HEADERS
                        or table_header_text.lower().startswith(
                            IGNORE_TABLE_HEADER_PREFIXES
                        )
                        or len(table_header_text.strip()) == 0
                    ):
                        continue
                    rsplit_header = table_header_text.rsplit(maxsplit=1)
                    if len(rsplit_header) > 1 and rsplit_header[-1].isdecimal():
                        # "Pluriel 1" in template "br-nom"
                        table_header_text = rsplit_header[0]

                    if not current_row_has_data_cell:
                        # if all cells of the row are header cells
                        # then the header cells are column headers
                        header_colspan = _parse_span(
                            table_cell.attrs.get("colspan")
                        )
                        if "colspan" in table_cell.attrs:
                            colspan_headers.append(
                                ColspanHeader(
                                    table_header_text,
                                    column_cell_index,
                                    header_colspan,
                                )
                            )
                        else:
                            column_headers.append(table_header_text)
                        column_cell_index += header_colspan
                    else:
                        if table_header_text not in row_headers:
                            row_headers.append(table_header_text)
                        if "rowspan" in table_cell.attrs:
                            remaining_rows = (
                                _parse_span(table_cell.attrs.get("rowspan")) - 1
                            )
                            # only headers covering more than the current
                            # row are carried over to the next rows
                            if remaining_rows > 0:
                                rowspan_headers.append(
                                    (table_header_text, remaining_rows)
                                )
                elif table_cell.kind == NodeKind.TABLE_CELL:
                    has_conj_link = False
                    for link_node in table_cell.find_child(NodeKind.LINK):
                        if is_conj_link(wxr, link_node):
                            process_conj_link_node(wxr, link_node, page_data)
                            has_conj_link = True
                            break
                    if has_conj_link:
                        continue
                    table_cell_lines = clean_node(wxr, None, table_cell)
                    for table_cell_line in table_cell_lines.splitlines():
                        if is_ipa_text(table_cell_line):
                            insert_ipa(form_data, table_cell_line)
                        elif (
                            table_cell_line != page_data[-1].word
                            and table_cell_line not in IGNORE_TABLE_CELL
                            and not table_cell_line.lower().startswith(
                                IGNORE_TABLE_CELL_PREFIXES
                            )
                        ):
                            # several form lines of one cell are collected
                            # newline-separated and split again below
                            if form_data.form == "":
                                form_data.form = table_cell_line
                            else:
                                form_data.form += "\n" + table_cell_line
                    for colspan_header in colspan_headers:
                        if (
                            column_cell_index >= colspan_header.index
                            and column_cell_index
                            < colspan_header.index + colspan_header.span
                        ):
                            form_data.raw_tags.append(colspan_header.text)
                    if (
                        "colspan" not in table_cell.attrs
                        and len(column_headers) > column_cell_index
                        and column_headers[column_cell_index].lower()
                        not in IGNORE_TABLE_HEADERS
                    ):
                        form_data.raw_tags.append(
                            column_headers[column_cell_index]
                        )

                    if len(row_headers) > 0:
                        form_data.raw_tags.extend(row_headers)
                    if form_data.form != "":
                        for form in form_data.form.splitlines():
                            if form.startswith("(") and form.endswith(")"):
                                # a parenthesised line is a tag for the
                                # forms that follow, not a form itself
                                form_data.raw_tags.append(form.strip("()"))
                                continue
                            new_form_data = form_data.model_copy(deep=True)
                            new_form_data.form = form.removeprefix("ou ")
                            translate_raw_tags(
                                new_form_data, table_template.template_name
                            )
                            if len(new_form_data.form.strip()) > 0:
                                page_data[-1].forms.append(new_form_data)

                    # a data cell always occupies at least one column, even
                    # when its colspan attribute is malformed
                    column_cell_index += _parse_span(
                        table_cell.attrs.get("colspan")
                    )

256 

257 

def split_ipa(text: str) -> list[str]:
    """Split one IPA line of an inflection table into separate IPA strings.

    * A line containing ``" ou "`` (or) holds several pronunciations and is
      split on that separator (seen in the "en-conj-rég" template).
    * A leading ``"ou "`` is removed.
    * A line ending with ``"Prononciation ?\\"`` yields an empty list:
      inflection table templates use an edit link when the IPA data is
      missing, and that link usually ends with "Prononciation ?".
    * Anything else is returned unchanged as a single-item list.
    """
    separator = " ou "
    if separator in text:
        return text.split(separator)

    leading_or = "ou "
    if text.startswith(leading_or):
        return [text.removeprefix(leading_or)]

    return [] if text.endswith("Prononciation ?\\") else [text]

270 

271 

def insert_ipa(form: Form, ipa_text: str) -> None:
    """Append the IPA strings found in ``ipa_text`` to ``form.ipas``.

    The text is split with ``split_ipa``; when that yields nothing the form
    is left untouched.
    """
    if ipa_values := split_ipa(ipa_text):
        form.ipas.extend(ipa_values)

277 

278 

def process_en_adj_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: WikiNode,
) -> None:
    """Extract forms from the table of an ``en-adj*`` template.

    See https://fr.wiktionary.org/wiki/Modèle:en-adj and the other en-adj*
    templates. These templates use normal table cells for the column
    headers, so the first row is skipped as the header row. In every other
    row with more than one child, the first child supplies the raw tag and
    the second child supplies the form and/or IPA lines. A form equal to the
    page word, or an empty form, is not stored.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if not tables:
        return

    rows = tables[0].find_child(NodeKind.TABLE_ROW)
    for row_index, row in enumerate(rows):
        # row 0 is the header row; shorter rows have no form cell
        if row_index == 0 or len(row.children) <= 1:
            continue

        entry = Form()
        entry.raw_tags.append(clean_node(wxr, None, row.children[0]))
        cell_text = clean_node(wxr, None, row.children[1])
        for line in cell_text.splitlines():
            if line in IGNORE_TABLE_CELL:
                continue
            if is_ipa_text(line):
                insert_ipa(entry, line)
            else:
                # a later non-IPA line overwrites an earlier one
                entry.form = line

        if entry.form == page_data[-1].word or len(entry.form) == 0:
            continue
        translate_raw_tags(entry)
        page_data[-1].forms.append(entry)