Coverage for src/wiktextract/extractor/fr/conjugation.py: 92%

456 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 HTMLNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .models import Form, WordEntry 

15from .tags import translate_raw_tags 

16 

17 

def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """
    Load a page of the "Conjugaison" namespace and extract its forms.

    Each top-level template of the page is routed, by template name, to
    the matching extractor in this module. Nothing happens when the page
    body cannot be found. ``select_tab`` is only forwarded to the
    "Onglets conjugaison" handler.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons
    """
    page_text = wxr.wtp.get_page_body(
        conj_page_title, wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    )
    if page_text is None:
        return
    root = wxr.wtp.parse(page_text)

    for t_node in root.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name.endswith("-intro"):
            continue
        # a transcluded conjugation page may be written with a leading ":"
        transcluded_title = name.removeprefix(":")

        # the order of the tests matters: the generic "-conj" test must
        # come after the language specific template names
        if name in ("ku-conj-trans", "ku-conj"):
            extract_ku_conj_trans_template(wxr, entry, t_node, conj_page_title)
        elif name == "ko-conj":
            extract_ko_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "de-conj":
            extract_de_conj_template(wxr, entry, t_node, conj_page_title)
        elif "-conj" in name:
            process_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "Onglets conjugaison":
            process_onglets_conjugaison_template(
                wxr, entry, t_node, conj_page_title, select_tab
            )
        elif transcluded_title.startswith("Conjugaison:"):
            # another conjugation page is transcluded: recurse into it,
            # the "sél" argument (default "2") becomes the selected tab
            selected = clean_node(
                wxr, None, t_node.template_parameters.get("sél", "2")
            )
            extract_conjugation(wxr, entry, transcluded_title, selected)
        elif name.startswith("ja-flx-adj"):
            process_ja_flx_adj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("ja-"):
            process_ja_conj_template(wxr, entry, t_node, conj_page_title)

    if conj_page_title.startswith("Conjugaison:kurde/"):
        # Kurdish pages may also contain tables written directly in the page
        for table_node in root.find_child(NodeKind.TABLE):
            extract_ku_conj_trans_table_node(
                wxr, entry, table_node, conj_page_title
            )

    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, None, link)

78 

79 

def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """
    Extract the "*-conj*" templates stored in the tabs of the node.

    Only the tab ``select_tab`` is read when it is not "1", or when the
    first tab is named "Conjugaison active"; otherwise tabs 1 to 6 are
    read until the first missing "contenuN" argument.
    """
    # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
    # this template expands to two tabs of tables
    params = node.template_parameters
    only_selected_tab = select_tab != "1" or (
        clean_node(wxr, None, params.get("onglet1", ""))
        == "Conjugaison active"
    )
    if only_selected_tab:
        # don't extract or only extract "Conjugaison pronominale" tab
        tab_numbers = [select_tab]
    else:
        tab_numbers = [str(number) for number in range(1, 7)]

    for tab_number in tab_numbers:
        content_key = f"contenu{tab_number}"
        if content_key not in params:
            break
        content = params[content_key]
        # the tab content is either a single node or a list of nodes
        if isinstance(content, TemplateNode):
            candidates = [content]
        elif isinstance(content, list):
            candidates = content
        else:
            candidates = []
        for candidate in candidates:
            if (
                isinstance(candidate, TemplateNode)
                and "-conj" in candidate.template_name
            ):
                process_conj_template(wxr, entry, candidate, conj_page_title)

117 

118 

def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Fully expand a "*-conj*" template and extract the expanded tree."""
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    # https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)

133 

134 

def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Walk an expanded conjugation template and extract its tables.

    Section nodes are visited recursively. The latest heading text (the
    level 3 title of ``node`` or a preceding ``<h3>`` tag) selects which
    table extractor handles each following ``<div>``.
    """
    if node.kind == NodeKind.LEVEL3:
        heading = clean_node(wxr, None, node.largs)
    else:
        heading = ""

    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue
        if child.tag == "h3":
            # remember the heading for the tables that follow it
            heading = clean_node(wxr, None, child)
            continue
        if child.tag != "div":
            continue
        if heading == "Modes impersonnels":
            process_fr_conj_modes_table(wxr, entry, child, conj_page_title)
        else:
            process_fr_conj_table(wxr, entry, child, heading, conj_page_title)

161 

162 

@dataclass
class TableHeader:
    """Text of a table header cell and the cell range it applies to."""

    # cleaned text of the header cell, used as a raw tag
    text: str
    # first column covered by a column header
    col_index: int = 0
    # number of columns covered by a column header
    colspan: int = 0
    # first row covered by a row header
    row_index: int = 0
    # number of rows covered by a row header
    rowspan: int = 0

170 

171 

def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the first "Modes impersonnels" table.

    Rows without data cells provide the column headers; in the other rows
    the header cell (or a bold first cell) is the row header. The texts of
    the data cells are joined into one form until a pronunciation cell is
    met, which completes the form.
    """
    for table in div_node.find_child(NodeKind.TABLE):
        column_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            header_only_row = not row.contain_node(NodeKind.TABLE_CELL)
            row_label = ""
            column = 0
            pending_form = ""
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                acts_as_header = cell.kind == NodeKind.TABLE_HEADER_CELL or (
                    cell.contain_node(NodeKind.BOLD) and column == 0
                )
                if acts_as_header and not header_only_row:
                    row_label = clean_node(wxr, None, cell)
                    continue
                if acts_as_header:
                    title = clean_node(wxr, None, cell)
                    # the "Mode" header sits above the row headers, it is
                    # neither a tag nor counted as a column
                    if title != "Mode":
                        span, _ = get_cell_span(cell)
                        column_headers.append(TableHeader(title, column, span))
                        column += span
                    continue

                text = clean_node(wxr, None, cell)
                if (
                    text.endswith(("]", "\\", "Prononciation ?"))
                    and pending_form != ""
                ):
                    # pronunciation cell: the pending form is complete
                    has_ipa = text.endswith(("]", "\\"))
                    form = Form(
                        form=pending_form,
                        ipas=[text] if has_ipa else [],
                        source=conj_page_title,
                    )
                    if row_label != "":
                        form.raw_tags.append(row_label)
                    form.raw_tags.extend(
                        header.text
                        for header in column_headers
                        if header.col_index
                        <= column
                        < header.col_index + header.colspan
                    )
                    translate_raw_tags(form)
                    entry.forms.append(form)
                    pending_form = ""
                elif text != "":
                    # no space after an elided word ending with "’"
                    if pending_form != "" and not pending_form.endswith("’"):
                        pending_form += " "
                    pending_form += text
                column += 1

238 

239 

def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Extract every tense table nested in the cells of a layout table.

    A nested table is either an HTML ``<table>`` tag or a wikitext table;
    each kind has its own extractor.
    """
    for layout_table in div_node.find_child(NodeKind.TABLE):
        for layout_row in layout_table.find_child(NodeKind.TABLE_ROW):
            for layout_cell in layout_row.find_child(NodeKind.TABLE_CELL):
                for nested in layout_cell.children:
                    if not isinstance(nested, WikiNode):
                        continue
                    if nested.kind == NodeKind.HTML and nested.tag == "table":
                        process_fr_conj_html_table(
                            wxr, entry, nested, h3_text, conj_page_title
                        )
                    elif nested.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, nested, h3_text, conj_page_title
                        )

267 

268 

def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per ``<tr>`` of an HTML tense table.

    The text of the first ``<tr>`` is used as a raw tag, after ``h3_text``
    when it is not empty. In the other rows the first two ``<td>`` build
    the form and the remaining ``<td>`` are concatenated into one IPA
    string.
    """
    tags = [h3_text] if h3_text != "" else []
    for row_number, tr in enumerate(table_node.find_html_recursively("tr")):
        if row_number == 0:
            tags.append(clean_node(wxr, None, tr.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for cell_number, td in enumerate(tr.find_html_recursively("td")):
            text = clean_node(wxr, None, td)
            if cell_number < 2:
                form.form += text
                # no space after an elided first cell such as "j’"
                if cell_number == 0 and not text.endswith("’"):
                    form.form += " "
            elif len(form.ipas) > 0:
                form.ipas[0] += text
            else:
                # first IPA part: no space after the liaison mark "‿"
                form.ipas.append(text if text.endswith("‿") else text + " ")

        translate_raw_tags(form)
        entry.forms.append(form)

300 

301 

def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per row of a wikitext tense table.

    The text of the first row is used as a raw tag, after ``h3_text`` when
    it is not empty. In the other rows the first two cells build the form
    and every later cell is an IPA value. Rows that produce an empty form
    are dropped.
    """
    tags = [h3_text] if h3_text != "" else []
    for row_number, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
        if row_number == 0:
            tags.append(clean_node(wxr, None, row.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for cell_number, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            text = clean_node(wxr, None, cell)
            lacks_pronunciation = text.endswith("Prononciation ?")
            if cell_number >= 2:
                if not lacks_pronunciation:
                    form.ipas.append(text)
                continue
            if text == "—" or lacks_pronunciation:
                continue
            # a cell starting with "-" is glued to the text before it,
            # unless that text ends with ")"
            if text.startswith("-") and not form.form.strip().endswith(")"):
                form.form = form.form.strip()
            form.form += text
            if cell_number == 0 and len(text) > 0:
                form.form += " "

        if len(form.form) > 0:
            translate_raw_tags(form)
            entry.forms.append(form)

337 

338 

def process_ja_flx_adj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from an expanded "ja-flx-adj*" template.

    A header cell sets the tag used by the rows that follow it. In a data
    row the lines of the first cell are extra tags, and the lines of the
    next three cells are the form, its hiragana and its romanization; one
    form is created per line.
    """
    # https://fr.wiktionary.org/wiki/Modèle:ja-adj
    # https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な
    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    # cell position -> attribute of the form filled from that cell
    attribute_of_cell = {1: "form", 2: "hiragana", 3: "roman"}

    for table in expanded_root.find_child(NodeKind.TABLE):
        group_tag = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_forms = []
            row_tags = [group_tag]
            cells = row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
            for cell_number, cell in enumerate(cells):
                text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    group_tag = text
                    continue
                for line_number, line in enumerate(text.splitlines()):
                    if cell_number == 0:
                        row_tags.append(line)
                        continue
                    if line_number >= len(row_forms):
                        new_form = Form(raw_tags=row_tags, source=conj_page_title)
                        row_forms.append(translate_raw_tags(new_form))
                    attribute = attribute_of_cell.get(cell_number)
                    if attribute is not None:
                        setattr(row_forms[line_number], attribute, line)

            entry.forms.extend(row_forms)

382 

383 

def process_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from an expanded Japanese verb conjugation template.

    A row holding a single header cell sets the tag shared by the rows
    after it. Header cells of the other rows are row headers that stay
    active for as many rows as their "rowspan" attribute says. The data
    cells of a row are, in order, the form, its hiragana and its
    romanization. Rows without a form are dropped.
    """
    # https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    # Modèle:ja-在る
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows the header still applies to
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if (
                all(
                    isinstance(c, WikiNode)
                    and c.kind == NodeKind.TABLE_HEADER_CELL
                    for c in row.children
                )
                and len(row.children) > 1
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    # get_cell_span() falls back to 1 on a malformed
                    # attribute instead of raising ValueError like int()
                    _, rowspan = get_cell_span(header)
                    row_headers[header_text] = rowspan

            tags = [first_tag]
            # iterate over a copy: expired headers are removed in the loop
            for tag, rowspan in row_headers.copy().items():
                tags.append(tag)
                if rowspan == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = rowspan - 1

            form = Form(raw_tags=tags, source=conj_page_title)
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_index == 0:
                    form.form = cell_text
                elif cell_index == 1:
                    form.hiragana = cell_text
                elif cell_index == 2:
                    form.roman = cell_text
            if len(form.form) > 0:
                translate_raw_tags(form)
                entry.forms.append(form)

440 

441 

def extract_ku_conj_trans_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a Kurdish conjugation template and extract all of its tables.

    The top-level links of the expanded template are cleaned with
    ``entry`` as the collecting object.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table_node in expanded_root.find_child(NodeKind.TABLE):
        extract_ku_conj_trans_table_node(wxr, entry, table_node, conj_page_title)
    for link in expanded_root.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link)

455 

456 

def extract_ku_conj_trans_table_node(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from one table of a Kurdish conjugation page.

    Header cells are recorded with the columns they cover, except the
    title-like headers listed in ``skipped_headers``. Each data cell
    becomes a form tagged with the headers covering its column, unless it
    is empty, "(inusité)" or equal to the page title.
    """
    skipped_headers = (
        "Conjugaison du verbe",
        "TEMPS DU PRÉSENT ET DU FUTUR",
        "TEMPS DU PRESENT ET DU FUTUR",
        "TEMPS DU PASSÉ",
        "TEMPS DU PASSE",
    )
    column_headers = []
    previous_row_had_header = False
    latest_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        column = 0
        row_has_header = row.contain_node(NodeKind.TABLE_HEADER_CELL)
        if row_has_header and not previous_row_had_header:
            # a new group of header rows replaces the previous headers
            column_headers.clear()
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            text = clean_node(wxr, None, cell)
            if text == "":
                column += 1
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                latest_header = text
                if text.startswith(skipped_headers):
                    continue
                span, _ = get_cell_span(cell)
                column_headers.append(
                    TableHeader(text=text, col_index=column, colspan=span)
                )
                column += span
                continue

            # NOTE(review): only the accented spelling is tested here while
            # skipped_headers also lists "TEMPS DU PASSE" - confirm intended
            if latest_header == "TEMPS DU PASSÉ":
                continue
            if text == "(inusité)":
                column += 1
                continue
            if text == wxr.wtp.title:
                # cell repeating the word itself: skipped, not counted
                continue

            form = Form(form=text, source=conj_page_title)
            form.raw_tags.extend(
                header.text
                for header in column_headers
                if header.col_index <= column < header.col_index + header.colspan
            )
            translate_raw_tags(form)
            entry.forms.append(form)
            column += 1
        previous_row_had_header = row_has_header

520 

521 

522def extract_ko_conj_template( 

523 wxr: WiktextractContext, 

524 entry: WordEntry, 

525 t_node: TemplateNode, 

526 conj_page_title: str, 

527) -> None: 

528 word_page_title = wxr.wtp.title 

529 wxr.wtp.title = conj_page_title 

530 expanded_node = wxr.wtp.parse( 

531 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

532 ) 

533 for h3 in expanded_node.find_html("h3"): 

534 clean_node(wxr, entry, h3) 

535 for table_index, table in enumerate( 

536 expanded_node.find_child(NodeKind.TABLE) 

537 ): 

538 if table_index == 0: 

539 continue 

540 shared_raw_tags = [] 

541 for caption_node in table.find_child(NodeKind.TABLE_CAPTION): 

542 caption = clean_node(wxr, None, caption_node.children) 

543 if caption != "": 543 ↛ 541line 543 didn't jump to line 541 because the condition on line 543 was always true

544 shared_raw_tags.append(caption) 

545 col_headers = [] 

546 row_headers = [] 

547 row_index = 0 

548 row_header_indexes = [0] 

549 for row in table.find_child(NodeKind.TABLE_ROW): 

550 col_index = 0 

551 for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL): 

552 cell_str = clean_node(wxr, None, header_cell) 

553 if cell_str == "": 

554 continue 

555 colspan, rowspan = get_cell_span(header_cell) 

556 if row.contain_node(NodeKind.TABLE_CELL): 

557 header_added = False 

558 current_row_index = row_index 

559 for index, row_header_index in enumerate( 559 ↛ 567line 559 didn't jump to line 567 because the loop on line 559 didn't complete

560 row_header_indexes 

561 ): 

562 if row_index >= row_header_index: 

563 current_row_index = row_header_indexes[index] 

564 row_header_indexes[index] += rowspan 

565 header_added = True 

566 break 

567 if not header_added: 567 ↛ 568line 567 didn't jump to line 568 because the condition on line 567 was never true

568 row_header_indexes.append(rowspan) 

569 row_headers.append( 

570 TableHeader( 

571 text=cell_str, 

572 row_index=current_row_index, 

573 rowspan=rowspan, 

574 ) 

575 ) 

576 else: 

577 col_headers.append( 

578 TableHeader( 

579 text=cell_str, 

580 col_index=col_index, 

581 colspan=colspan, 

582 ) 

583 ) 

584 col_index += colspan 

585 if row.contain_node(NodeKind.TABLE_CELL): 

586 row_index += 1 

587 

588 row_index = 0 

589 for row in table.find_child(NodeKind.TABLE_ROW): 

590 col_index = 0 

591 for cell in row.find_child(NodeKind.TABLE_CELL): 

592 cell_str = clean_node(wxr, None, cell) 

593 colspan, rowspan = get_cell_span(cell) 

594 if cell_str == "—": 594 ↛ 595line 594 didn't jump to line 595 because the condition on line 594 was never true

595 col_index += 1 

596 else: 

597 form = Form( 

598 source=conj_page_title, raw_tags=shared_raw_tags 

599 ) 

600 for line_index, line in enumerate(cell_str.splitlines()): 

601 match line_index: 

602 case 0: 

603 form.form = line 

604 case 1: 

605 form.roman = line 

606 case 2: 606 ↛ 600line 606 didn't jump to line 600 because the pattern on line 606 always matched

607 form.ipas.append(line) 

608 for header in col_headers: 

609 if ( 

610 col_index >= header.col_index 

611 and col_index < header.col_index + header.colspan 

612 ): 

613 form.raw_tags.append(header.text) 

614 for header in row_headers: 

615 if ( 

616 row_index < header.row_index + header.rowspan 

617 and row_index + rowspan > header.row_index 

618 ): 

619 form.raw_tags.append(header.text) 

620 if form.form not in ["", wxr.wtp.title]: 620 ↛ 623line 620 didn't jump to line 623 because the condition on line 620 was always true

621 translate_raw_tags(form) 

622 entry.forms.append(form) 

623 col_index += 1 

624 if row.contain_node(NodeKind.TABLE_CELL): 

625 row_index += 1 

626 

627 for link in expanded_node.find_child(NodeKind.LINK): 

628 clean_node(wxr, entry, link) 

629 wxr.wtp.title = word_page_title 

630 

631 

def get_cell_span(cell: WikiNode) -> tuple[int, int]:
    """
    Return the ``(colspan, rowspan)`` of a table cell.

    Each value is read from the attribute of the same name. A missing
    attribute, or a value that is not made of digits only, gives 1.
    Whitespace around the value is ignored. A "colspan" of 0 is returned
    as 1, because a cell always occupies at least one column; a "rowspan"
    of 0 is returned unchanged.
    """

    def read_span(attr_name: str) -> int:
        raw_value = cell.attrs.get(attr_name, "1").strip()
        if re.fullmatch(r"\d+", raw_value) is None:
            return 1
        return int(raw_value)

    # a zero width span would stall the column counters of the callers
    colspan = max(read_span("colspan"), 1)
    rowspan = read_span("rowspan")
    return colspan, rowspan

642 

643 

def extract_de_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
):
    """
    Expand a "de-conj" template and extract the forms of its tables.

    The template is expanded with ``wxr.wtp.title`` set to
    ``conj_page_title``; the original title is restored right after the
    expansion, even when the expansion raises.

    In the first two tables each data cell is one form, and a
    "tag: form" cell is split at the first colon. In the later tables data
    cells come in pairs that are joined with a space. Forms equal to the
    page title are not added.
    """
    word_page_title = wxr.wtp.title
    wxr.wtp.title = conj_page_title
    try:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
    finally:
        wxr.wtp.title = word_page_title

    for table_index, table in enumerate(
        expanded_node.find_child(NodeKind.TABLE)
    ):
        table_header = ""
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            word_part = ""
            col_index = 0
            if table_index >= 2 and row.contain_node(
                NodeKind.TABLE_HEADER_CELL
            ):
                # from the third table on, each header row starts a new
                # set of column headers
                col_headers.clear()
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    # empty cells are not counted as columns
                    continue
                elif cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if len(row.children) == 1:
                        table_header = cell_text
                    else:
                        col_headers.append(cell_text)
                elif table_index < 2:
                    form = Form(form=cell_text, source=conj_page_title)
                    if ":" in cell_text:
                        colon_index = cell_text.index(":")
                        raw_tag = cell_text[:colon_index].strip()
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        form.form = cell_text[colon_index + 1 :].strip()
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if col_index < len(col_headers):
                        form.raw_tags.append(col_headers[col_index])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                elif col_index % 2 == 0:
                    # first cell of a pair, joined with the next cell
                    word_part = cell_text
                else:
                    form = Form(
                        form=f"{word_part} {cell_text}", source=conj_page_title
                    )
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    # one column header covers each pair of cells
                    if col_index // 2 < len(col_headers):
                        form.raw_tags.append(col_headers[col_index // 2])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += 1

    for cat_link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, cat_link)

711 

712 

def extract_declension_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
):
    """
    Load a page of the "Appendix" namespace and extract its templates.

    Every top-level template of the page is passed to
    ``extract_declension_template`` with an empty tab name. Nothing
    happens when the page body cannot be found.
    """
    appendix_namespace_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
    body = wxr.wtp.get_page_body(page_title, appendix_namespace_id)
    if body is None:
        return
    for template in wxr.wtp.parse(body).find_child(NodeKind.TEMPLATE):
        extract_declension_template(wxr, word_entry, page_title, template, "")

724 

725 

def extract_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
):
    """
    Extract forms from one template of a declension page.

    German adjective declension templates are extracted directly, with
    ``tab_name`` added to the tags of their forms. An "Onglets
    conjugaison" template is unpacked: the templates found in each
    "contenuN" argument are processed recursively, with the text of the
    matching "ongletN" argument as tab name. Reading stops at the first
    missing "ongletN"; a tab that has a name but no "contenuN" argument is
    skipped instead of raising ``KeyError``.
    """
    if t_node.template_name in [
        "de-adjectif-déclinaisons",
        "de-adj-déclinaisons",
    ]:
        extract_de_adj_declension_template(
            wxr, word_entry, page_title, t_node, tab_name
        )
    elif t_node.template_name == "Onglets conjugaison":
        params = t_node.template_parameters
        for index in range(1, 7):
            tab_name_arg = f"onglet{index}"
            if tab_name_arg not in params:
                break
            tab_content_arg = f"contenu{index}"
            if tab_content_arg not in params:
                # named tab without content: nothing to extract from it
                continue
            current_tab_name = clean_node(wxr, None, params[tab_name_arg])
            tab_content = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(params[tab_content_arg])
            )
            for node in tab_content.find_child(NodeKind.TEMPLATE):
                extract_declension_template(
                    wxr, word_entry, page_title, node, current_tab_name
                )

757 

758 

def extract_de_adj_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
):
    """
    Expand a German adjective declension template and extract its forms.

    Each table of each section of the expanded template is read. Rows
    without data cells provide the column headers (the "Forme" header is
    not recorded); in data rows the header cell is the row header. A cell
    in a column headed "Article" is kept as the article of the forms that
    follow it in the row. Every other data cell becomes a form tagged with
    its column headers, ``tab_name``, the section title, the table caption
    and the row header. Forms equal to the page title are not added.
    """
    # https://fr.wiktionary.org/wiki/Modèle:de-adjectif-déclinaisons
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for level_node in expanded_node.find_child(LEVEL_KIND_FLAGS):
        section_title = clean_node(wxr, None, level_node.largs)
        for table in level_node.find_child(NodeKind.TABLE):
            table_caption = ""
            for cap_node in table.find_child(NodeKind.TABLE_CAPTION):
                table_caption = clean_node(wxr, None, cap_node.children)
            col_headers = []
            for row in table.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                row_has_data = row.contain_node(NodeKind.TABLE_CELL)
                article = ""
                for cell in row.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    # get_cell_span() falls back to 1 on a malformed
                    # attribute instead of raising ValueError like int()
                    colspan, _ = get_cell_span(cell)
                    cell_text = clean_node(wxr, None, cell)
                    if cell.kind == NodeKind.TABLE_HEADER_CELL:
                        if row_has_data:
                            row_header = cell_text
                        elif cell_text != "Forme":
                            col_headers.append(
                                TableHeader(
                                    cell_text,
                                    col_index=col_index,
                                    colspan=colspan,
                                )
                            )
                    else:
                        use_col_headers = [
                            col_header.text
                            for col_header in col_headers
                            if col_header.col_index
                            <= col_index
                            < col_header.col_index + col_header.colspan
                        ]
                        if "Article" in use_col_headers:
                            if cell_text != "—":
                                article = cell_text
                        else:
                            form = Form(
                                form=cell_text,
                                article=article,
                                raw_tags=use_col_headers,
                                source=page_title,
                            )
                            for raw_tag in [
                                tab_name,
                                section_title,
                                table_caption,
                                row_header,
                            ]:
                                if raw_tag != "":
                                    form.raw_tags.append(raw_tag)
                            if form.form not in ["", wxr.wtp.title]:
                                translate_raw_tags(form)
                                word_entry.forms.append(form)
                    # header cells and data cells both occupy columns
                    col_index += colspan

        for link in level_node.find_child(NodeKind.LINK):
            clean_node(wxr, word_entry, link)