Coverage for src/wiktextract/extractor/fr/conjugation.py: 94%

180 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from wikitextprocessor.parser import ( 

2 LEVEL_KIND_FLAGS, 

3 HTMLNode, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .models import Form, WordEntry 

12from .tags import translate_raw_tags 

13 

14 

def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """
    Load a page from the "Conjugaison" namespace and extract its forms.

    Every top-level template of the parsed page is dispatched on its name to
    one of the handlers below, which add the forms they find to `entry`.
    Templates matching none of the patterns are ignored, and nothing happens
    when the page body cannot be retrieved.

    `select_tab` is forwarded to the "Onglets conjugaison" handler. When the
    page transcludes another "Conjugaison:" page, that page is processed
    recursively, with the template's "sél" argument (default "2") used as
    `select_tab`.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    page_body = wxr.wtp.get_page_body(conj_page_title, namespace_id)
    if page_body is None:
        return

    for template in wxr.wtp.parse(page_body).find_child(NodeKind.TEMPLATE):
        name = template.template_name
        # The order of the checks matters: earlier patterns take precedence.
        if name.endswith("-intro"):
            continue
        if "-conj" in name:
            process_conj_template(wxr, entry, template, conj_page_title)
            continue
        if name == "Onglets conjugaison":
            process_onglets_conjugaison_template(
                wxr, entry, template, conj_page_title, select_tab
            )
            continue
        # Transclusion of another conjugation page, e.g. "{{:Conjugaison:...}}"
        linked_title = name.removeprefix(":")
        if linked_title.startswith("Conjugaison:"):
            linked_tab = clean_node(
                wxr, None, template.template_parameters.get("sél", "2")
            )
            extract_conjugation(wxr, entry, linked_title, linked_tab)
            continue
        if name.startswith("ja-flx-adj"):
            proces_ja_flx_adj_template(wxr, entry, template, conj_page_title)
            continue
        if name.startswith("ja-"):
            proces_ja_conj_template(wxr, entry, template, conj_page_title)

60 

61 

def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """
    Extract the conjugation templates held in the tabs of this template.

    https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison

    Tab contents are read from the "contenuN" arguments. Only the tab named
    by `select_tab` is processed when `select_tab` is not "1", or when the
    "onglet1" argument reads "Conjugaison active"; otherwise tabs 1 to 6 are
    tried in order, stopping at the first missing "contenuN" argument.
    """
    params = node.template_parameters
    only_selected = (
        select_tab != "1"
        or clean_node(wxr, None, params.get("onglet1", ""))
        == "Conjugaison active"
    )
    if only_selected:
        tab_numbers = [select_tab]
    else:
        tab_numbers = [str(number) for number in range(1, 7)]

    for tab_number in tab_numbers:
        content_key = f"contenu{tab_number}"
        if content_key not in params:
            break
        content = params[content_key]
        # An argument value is either a single node/string or a list of them;
        # treat both shapes the same way.
        candidates = content if isinstance(content, list) else [content]
        for candidate in candidates:
            if not isinstance(candidate, TemplateNode):
                continue
            if "-conj" in candidate.template_name:
                process_conj_template(wxr, entry, candidate, conj_page_title)

99 

100 

def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a "-conj" template and extract forms from the expanded tree.

    https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    """
    # Serialise the node back to wikitext so it can be re-parsed with every
    # template expanded.
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)

115 

116 

def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Walk an expanded conjugation template and process its table containers.

    Section (level) nodes are descended into recursively. Among HTML
    children, the text of an <h3> is remembered as the current heading, and
    each following <div> is processed under that heading. A <div> under the
    heading "Modes impersonnels" goes to the dedicated handler for that
    table; any other <div> goes to the generic table handler.
    """
    current_heading = ""
    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue

        if child.tag == "h3":
            current_heading = clean_node(wxr, None, child)
        elif child.tag == "div":
            if current_heading == "Modes impersonnels":
                process_fr_conj_modes_table(wxr, entry, child, conj_page_title)
            else:
                process_fr_conj_table(
                    wxr, entry, child, current_heading, conj_page_title
                )

139 

140 

def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the first table, "Modes impersonnels".

    The first row of each table is skipped as a header. In the other rows,
    cell 0 supplies a raw tag, every cell whose index is a multiple of 3
    supplies the IPA that completes a form, and the cells in between are
    joined (with a space, unless the text so far ends with "’") into the
    form text. The form completed at cell 3 is tagged "Présent", later ones
    "Passé". A form whose text already exists in `entry.forms` is not added
    again.
    """
    seen_forms = {existing.form for existing in entry.forms}

    for table in div_node.find_child(NodeKind.TABLE):
        rows = iter(table.find_child(NodeKind.TABLE_ROW))
        next(rows, None)  # the first row is the header
        for row in rows:
            row_tags = []
            pending_text = ""
            for position, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if position == 0:
                    row_tags.append(cell_text)
                elif position % 3 != 0:
                    # still collecting the pieces of the current form
                    if pending_text and not pending_text.endswith("’"):
                        pending_text += " "
                    pending_text += cell_text
                else:
                    # an IPA cell closes the form collected so far
                    tense_tag = "Présent" if position == 3 else "Passé"
                    form = Form(
                        form=pending_text,
                        raw_tags=[*row_tags, tense_tag],
                        ipas=[cell_text],
                        source=conj_page_title,
                    )
                    translate_raw_tags(form)
                    if form.form not in seen_forms:
                        entry.forms.append(form)
                        seen_forms.add(form.form)
                    pending_text = ""

182 

183 

def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Find the tables nested in the cells of the outer tables and process them.

    For every cell of every row of each table directly under `div_node`, an
    HTML <table> child is passed to the HTML-table handler and a wikitext
    table child to the wiki-table handler. `h3_text` is passed through to
    both handlers.
    """
    for outer_table in div_node.find_child(NodeKind.TABLE):
        for outer_row in outer_table.find_child(NodeKind.TABLE_ROW):
            for outer_cell in outer_row.find_child(NodeKind.TABLE_CELL):
                for nested in outer_cell.children:
                    if not isinstance(nested, WikiNode):
                        continue
                    if nested.kind == NodeKind.HTML and nested.tag == "table":
                        process_fr_conj_html_table(
                            wxr, entry, nested, h3_text, conj_page_title
                        )
                    elif nested.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, nested, h3_text, conj_page_title
                        )

211 

212 

def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Extract forms from a conjugation table written as an HTML <table>.

    The text of the first <tr> becomes a raw tag, after `h3_text`, shared by
    all forms of the table. In each later <tr>, the first two <td> cells are
    joined into the form text and the remaining cells are concatenated into
    one IPA string.

    Form and IPA text are stripped of surrounding whitespace, and rows that
    yield no form text are skipped, so that an empty cell or a row without
    <td> cells cannot produce a padded or empty form.
    """
    tags = [h3_text]
    for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
        if tr_index == 0:
            tags.append(clean_node(wxr, None, tr_node.children))
            continue

        # Give each form its own tag list so that processing one form's tags
        # cannot affect the rows that follow.
        form = Form(raw_tags=tags.copy(), source=conj_page_title)
        for td_index, td_node in enumerate(
            tr_node.find_html_recursively("td")
        ):
            td_text = clean_node(wxr, None, td_node)
            if td_index < 2:
                form.form += td_text
                # no space after an elided first cell such as "j’"
                if td_index == 0 and not td_text.endswith("’"):
                    form.form += " "
            elif len(form.ipas) > 0:
                form.ipas[0] += td_text
            else:
                # no space after a liaison mark
                if not td_text.endswith("‿"):
                    td_text += " "
                form.ipas.append(td_text)

        # Separators are added before it is known whether another cell
        # follows, and an empty first cell still gets one; trim the leftovers.
        form.form = form.form.strip()
        form.ipas = [ipa for ipa in (i.strip() for i in form.ipas) if ipa]
        if len(form.form) == 0:
            continue

        translate_raw_tags(form)
        entry.forms.append(form)

244 

245 

def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract forms from a conjugation table written in wikitext table syntax.

    The text of the first row becomes a raw tag, after `h3_text`, for all
    forms of the table. In each later row, the first two cells build the
    form text and every further cell is stored as an IPA string. A cell
    containing only "—" is ignored, and a cell starting with "-" is attached
    to the preceding text without a space. Rows that yield no form text are
    dropped.
    """
    table_tags = [h3_text]
    rows = iter(table_node.find_child(NodeKind.TABLE_ROW))
    header_row = next(rows, None)
    if header_row is None:
        return
    table_tags.append(clean_node(wxr, None, header_row.children))

    for row in rows:
        form = Form(raw_tags=table_tags, source=conj_page_title)
        for position, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            text = clean_node(wxr, None, cell)
            if position >= 2:
                form.ipas.append(text)
                continue
            if text == "—":
                continue
            if text.startswith("-"):
                # remove the separator added after the previous cell
                form.form = form.form.strip()
            form.form += text
            if position == 0 and text:
                form.form += " "

        if form.form:
            translate_raw_tags(form)
            entry.forms.append(form)

277 

278 

def proces_ja_flx_adj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from an expanded Japanese adjective inflection template.

    https://fr.wiktionary.org/wiki/Modèle:ja-adj
    https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な

    A header cell sets the leading raw tag for the rows that follow it. In
    every row, the lines of the child at index 0 are added as raw tags, and
    the lines of the children at index 1, 2 and 3 fill the `form`,
    `hiragana` and `roman` fields; the n-th line of each of these cells
    belongs to the n-th form of the row.
    """
    # which Form attribute each cell index fills
    field_by_index = {1: "form", 2: "hiragana", 3: "roman"}

    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table in expanded_root.find_child(NodeKind.TABLE):
        section_tag = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_forms = []
            row_tags = [section_tag]
            cells = row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
            for position, cell in enumerate(cells):
                cell_text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    section_tag = cell_text
                    continue

                cell_lines = cell_text.splitlines()
                if position == 0:
                    row_tags.extend(cell_lines)
                    continue

                field_name = field_by_index.get(position)
                for line_number, line in enumerate(cell_lines):
                    # create the form for this line on first encounter
                    if line_number >= len(row_forms):
                        row_forms.append(
                            translate_raw_tags(
                                Form(raw_tags=row_tags, source=conj_page_title)
                            )
                        )
                    if field_name is not None:
                        setattr(row_forms[line_number], field_name, line)

            entry.forms.extend(row_forms)

322 

323 

def proces_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from an expanded Japanese verb conjugation template.

    https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    Modèle:ja-在る

    For each table of the expanded template:

    - a row made only of header cells, with more than one child, is skipped;
    - a header cell that is the only child of its row sets the leading raw
      tag for the following rows;
    - any other header cell becomes a raw tag for as many rows as its
      "rowspan" attribute says (1 when the attribute is missing or is not a
      valid integer);
    - data cells 0, 1 and 2 fill the `form`, `hiragana` and `roman` fields.

    Rows that yield no form text are dropped.
    """
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows (including the current one) it still
        # applies to
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if (
                all(
                    isinstance(c, WikiNode)
                    and c.kind == NodeKind.TABLE_HEADER_CELL
                    for c in row.children
                )
                and len(row.children) > 1
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    # The attribute comes from user-edited wikitext; a
                    # malformed value must not abort the whole extraction.
                    try:
                        rowspan = int(header.attrs.get("rowspan", "1"))
                    except (TypeError, ValueError):
                        rowspan = 1
                    row_headers[header_text] = rowspan

            tags = [first_tag]
            # Iterate over a snapshot because entries are deleted or updated
            # while the remaining row counts are decremented.
            for tag, remaining_rows in list(row_headers.items()):
                tags.append(tag)
                if remaining_rows == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = remaining_rows - 1

            form = Form(raw_tags=tags, source=conj_page_title)
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_index == 0:
                    form.form = cell_text
                elif cell_index == 1:
                    form.hiragana = cell_text
                elif cell_index == 2:
                    form.roman = cell_text
            if len(form.form) > 0:
                translate_raw_tags(form)
                entry.forms.append(form)