Coverage for src/wiktextract/extractor/fr/inflection.py: 92%

130 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .pronunciation import is_ipa_text 

9from .tags import translate_raw_tags 

10 

11 

def extract_inflection(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
) -> None:
    """Dispatch an inflection template to the matching table processor.

    Templates whose name starts with "en-adj" are handled by
    `process_en_adj_table`; every other template is handled by
    `process_inflection_table`. Both receive the arguments unchanged.
    """
    # inflection templates
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    handler = (
        process_en_adj_table
        if template_node.template_name.startswith("en-adj")
        else process_inflection_table
    )
    handler(wxr, page_data, template_node)

23 

24 

# Header texts that carry no grammatical information. They are matched
# against the lower-cased header text; the trailing comment names a template
# in which the header occurs.
IGNORE_TABLE_HEADERS = frozenset(
    (
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "forme",  # br-flex-adj
        "temps",  # en-conj-rég
        "cas",  # lt_décl_as, ro-nom-tab (lower case)
        "commun",  # sv-nom-c-ar
        "personne",  # hu-pos-otok
        "pronom personnel",  # it-enclise
        "mutation",  # br-nom
        "nombre",  # ca-accord-mixte2
        "nature",  # de-adj
        "genre",  # es-accord-oa
        "conjugaison présent indicatif",  # avk-tab-conjug
        "mode",  # eo-conj
        "avec suffixes possessifs",  # fi-décl-valo
    )
)

# Lower-case prefixes of header texts that are skipped.
IGNORE_TABLE_HEADER_PREFIXES = (
    "voir la conjugaison du verbe ",  # Modèle:fr-verbe-flexion
    "conjugaison de ",  # sv-conj-ar
    "déclinaison de ",  # da-adj
)

# Data cell lines that are never stored as a form (exact, case-sensitive
# match).
IGNORE_TABLE_CELL = frozenset(
    (
        "Déclinaisons",  # de-adj
        "—",  # https://fr.wiktionary.org/wiki/Modèle:vls-nom
    )
)

# Lower-case prefixes of data cell lines that are never stored as a form.
IGNORE_TABLE_CELL_PREFIXES = (
    "voir conjugaison ",  # en-conj, avk-conj
)

57 

58 

@dataclass
class ColspanHeader:
    """A column header that spans one or more table columns."""

    # cleaned header text, later added to the raw tags of covered cells
    text: str
    # index of the first column covered by the header
    index: int
    # number of columns covered, starting at `index`
    span: int

64 

65 

def table_data_cell_is_header(
    wxr: WiktextractContext, cell_node: WikiNode, page_title: str
) -> bool:
    """Tell whether a plain table cell is used as a header cell.

    Only the first child yielded by `filter_empty_str_child()` is looked at.
    The cell counts as a header when that child is a bold node whose cleaned
    text is not empty, starts with an upper-case character and differs from
    `page_title`. Cells of any other kind, and cells without such a child,
    are not headers.
    """
    if cell_node.kind != NodeKind.TABLE_CELL:
        return False

    for first_child in cell_node.filter_empty_str_child():
        # the text is cleaned before the node type is checked, as before
        text = clean_node(wxr, None, first_child)
        if not isinstance(first_child, WikiNode):
            return False
        if first_child.kind != NodeKind.BOLD:
            return False
        return len(text) > 0 and text[0].isupper() and text != page_title

    return False

82 

83 

def _parse_span(attrs: dict, name: str) -> int:
    """Return the `colspan`/`rowspan` attribute `name` of a cell as an int.

    A missing, empty or non-decimal value yields 1 instead of raising
    `ValueError`, so a malformed attribute is treated like an ordinary
    single cell.
    """
    span_text = str(attrs.get(name, "1")).strip()
    return int(span_text) if span_text.isdecimal() else 1


def _visible_row_cells(table_row: WikiNode) -> list[WikiNode]:
    """Return the cells of `table_row` that take part in the extraction.

    Kept are header cells and data cells for which
    `filter_empty_str_child()` yields at least one child, unless the cell has
    the style "display:none" or a class attribute containing "invisible".
    """
    return [
        cell
        for cell in table_row.children
        if isinstance(cell, WikiNode)
        and (
            cell.kind == NodeKind.TABLE_HEADER_CELL
            or (
                cell.kind == NodeKind.TABLE_CELL
                and len(list(cell.filter_empty_str_child())) > 0
            )
        )
        and cell.attrs.get("style") != "display:none"
        and "invisible" not in cell.attrs.get("class", "")
    ]


def process_inflection_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_template: TemplateNode,
) -> None:
    """Extract forms from the first table of an expanded inflection template.

    The template is expanded and parsed, and the rows of the first table node
    are walked in order. A row in which every kept cell is a header supplies
    column headers; a header cell in a row with data cells supplies row
    headers. Each data cell is split into lines: IPA lines are added to the
    form's `ipas`, other lines become forms unless they equal the page word
    or are listed in the ignore constants. Every form is tagged with the
    column and row headers that apply to its cell and appended to
    `page_data[-1].forms`.

    Nothing is returned; `page_data[-1]` is modified in place. The function
    returns early when the expanded template contains no table.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(table_template), expand_all=True
    )
    table_nodes = list(expanded_node.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]

    column_headers = []
    # pairs of (header text, number of following rows still covered)
    rowspan_headers = []
    colspan_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        # filter empty and hidden table cells
        table_row_nodes = _visible_row_cells(table_row)
        current_row_has_data_cell = any(
            cell.kind == NodeKind.TABLE_CELL
            and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
            for cell in table_row_nodes
        )
        if not current_row_has_data_cell:
            # a new header-only row replaces the previous column headers
            column_headers.clear()

        # headers carried over from earlier rows through "rowspan"
        row_headers = [text for text, _ in rowspan_headers]
        rowspan_headers = [
            (text, remaining - 1)
            for text, remaining in rowspan_headers
            if remaining - 1 > 0
        ]

        column_cell_index = 0
        for table_cell in table_row_nodes:
            if (
                table_cell.kind == NodeKind.TABLE_HEADER_CELL
                or table_data_cell_is_header(
                    wxr, table_cell, page_data[-1].word
                )
            ):
                if any(
                    table_cell.find_html(
                        "span",
                        attr_name="class",
                        attr_value="ligne-de-forme",
                    )
                ):
                    # ignore gender header in template "ro-nom-tab"
                    continue
                table_header_text = clean_node(
                    wxr, None, table_cell
                ).replace("\n", " ")
                lower_header_text = table_header_text.lower()
                if (
                    lower_header_text in IGNORE_TABLE_HEADERS
                    or lower_header_text.startswith(
                        IGNORE_TABLE_HEADER_PREFIXES
                    )
                    or len(table_header_text.strip()) == 0
                ):
                    continue
                rsplit_header = table_header_text.rsplit(maxsplit=1)
                if len(rsplit_header) > 1 and rsplit_header[-1].isdecimal():
                    # "Pluriel 1" in template "br-nom"
                    table_header_text = rsplit_header[0]

                if not current_row_has_data_cell:
                    # if all cells of the row are header cells
                    # then the header cells are column headers
                    colspan = _parse_span(table_cell.attrs, "colspan")
                    if "colspan" in table_cell.attrs:
                        colspan_headers.append(
                            ColspanHeader(
                                table_header_text, column_cell_index, colspan
                            )
                        )
                    else:
                        column_headers.append(table_header_text)
                    column_cell_index += colspan
                else:
                    if table_header_text not in row_headers:
                        row_headers.append(table_header_text)
                    if "rowspan" in table_cell.attrs:
                        # the current row is already covered, so only the
                        # rows below it are remembered; rowspan="1" covers
                        # no further row and must not be carried over
                        remaining_rows = (
                            _parse_span(table_cell.attrs, "rowspan") - 1
                        )
                        if remaining_rows > 0:
                            rowspan_headers.append(
                                (table_header_text, remaining_rows)
                            )
            elif table_cell.kind == NodeKind.TABLE_CELL:
                form_data = Form()
                table_cell_lines = clean_node(wxr, None, table_cell)
                for table_cell_line in table_cell_lines.splitlines():
                    if is_ipa_text(table_cell_line):
                        insert_ipa(form_data, table_cell_line)
                    elif (
                        table_cell_line != page_data[-1].word
                        and table_cell_line not in IGNORE_TABLE_CELL
                        and not table_cell_line.lower().startswith(
                            IGNORE_TABLE_CELL_PREFIXES
                        )
                    ):
                        # several forms in one cell are kept one per line
                        # and split again below
                        if form_data.form == "":
                            form_data.form = table_cell_line
                        else:
                            form_data.form += "\n" + table_cell_line

                for colspan_header in colspan_headers:
                    if (
                        colspan_header.index
                        <= column_cell_index
                        < colspan_header.index + colspan_header.span
                    ):
                        form_data.raw_tags.append(colspan_header.text)
                if (
                    "colspan" not in table_cell.attrs
                    and len(column_headers) > column_cell_index
                    and column_headers[column_cell_index].lower()
                    not in IGNORE_TABLE_HEADERS
                ):
                    form_data.raw_tags.append(
                        column_headers[column_cell_index]
                    )

                if len(row_headers) > 0:
                    form_data.raw_tags.extend(row_headers)
                if form_data.form != "":
                    for form in form_data.form.splitlines():
                        new_form_data = form_data.model_copy(deep=True)
                        new_form_data.form = form.removeprefix("ou ")
                        translate_raw_tags(
                            new_form_data, table_template.template_name
                        )
                        if len(new_form_data.form.strip()) > 0:
                            page_data[-1].forms.append(new_form_data)

                # a malformed colspan still occupies one column
                column_cell_index += _parse_span(table_cell.attrs, "colspan")

242 

243 

def split_ipa(text: str) -> list[str]:
    """Split one line of IPA text into separate transcriptions.

    - Text containing " ou " ("or") is split at every " ou "; the
      "en-conj-rég" template puts two IPA texts on the same line.
    - Text starting with "ou " is returned without that prefix.
    - Text ending with "Prononciation ?\\" gives an empty list: inflection
      table templates use an edit link when the IPA data is missing, and the
      link usually ends with "Prononciation ?".
    - Any other text is returned unchanged as the only item.
    """
    separator = " ou "
    if separator in text:
        return text.split(separator)

    leading_or = "ou "
    if text.startswith(leading_or):
        return [text[len(leading_or) :]]

    return [] if text.endswith("Prononciation ?\\") else [text]

256 

257 

def insert_ipa(form: Form, ipa_text: str) -> None:
    """Add the transcriptions found in `ipa_text` to `form.ipas`.

    The text is split with `split_ipa`; `form` is left untouched when the
    split gives no item.
    """
    if pronunciations := split_ipa(ipa_text):
        form.ipas.extend(pronunciations)

263 

264 

def process_en_adj_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: WikiNode,
) -> None:
    """Extract forms from the table of an "en-adj*" template.

    The template is expanded and parsed, and only its first table is read.
    The first row is skipped as the header. In each later row with more than
    one child, the first child supplies the raw tag and the second child the
    form and IPA lines. A form is appended to `page_data[-1].forms` when it
    is not empty and differs from the page word.
    """
    # https://fr.wiktionary.org/wiki/Modèle:en-adj
    # and other en-adj* templates
    # these templates use normal table cell for column table header
    parsed_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    tables = list(parsed_root.find_child(NodeKind.TABLE))
    if not tables:
        return

    rows = tables[0].find_child(NodeKind.TABLE_ROW)
    for row_index, row in enumerate(rows):
        # row 0 is the header; shorter rows have no form cell
        if row_index == 0 or len(row.children) <= 1:
            continue

        entry = Form()
        entry.raw_tags.append(clean_node(wxr, None, row.children[0]))
        cell_text = clean_node(wxr, None, row.children[1])
        for line in cell_text.splitlines():
            if line in IGNORE_TABLE_CELL:
                continue
            if is_ipa_text(line):
                insert_ipa(entry, line)
            else:
                # a later non-IPA line replaces an earlier one
                entry.form = line

        if entry.form != page_data[-1].word and entry.form != "":
            translate_raw_tags(entry)
            page_data[-1].forms.append(entry)