Coverage for src/wiktextract/extractor/fr/conjugation.py: 92%

458 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-17 05:52 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 HTMLNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .models import Form, WordEntry 

15from .tags import translate_raw_tags 

16 

17 

def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """Extract inflected forms from a page of the "Conjugaison" namespace.

    The page body is parsed and every top-level template is routed, by
    template name, to the extractor written for that table layout.  A
    template whose name is itself a "Conjugaison:" title is followed
    recursively.

    References:
    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons

    Args:
        wxr: Extraction context; ``wxr.wtp`` provides page lookup and parsing.
        entry: Word entry handed to the per-template extractors.
        conj_page_title: Title of the conjugation page to read.
        select_tab: Tab selector forwarded to the "Onglets conjugaison"
            handler.
    """
    page_text = wxr.wtp.get_page_body(
        conj_page_title, wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    )
    if page_text is None:
        return

    root = wxr.wtp.parse(page_text)
    for t_node in root.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        # Name with an optional leading ":" dropped; only used to detect a
        # transcluded conjugation page.
        transcluded_title = name.removeprefix(":")

        if name.endswith("-intro"):
            continue

        # The order of the checks matters: the language-specific names must
        # be tested before the generic "-conj" substring.
        if name in ("ku-conj-trans", "ku-conj"):
            extract_ku_conj_trans_template(
                wxr, entry, t_node, conj_page_title
            )
        elif name == "ko-conj":
            extract_ko_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "de-conj":
            extract_de_conj_template(wxr, entry, t_node, conj_page_title)
        elif "-conj" in name or name.startswith("it-"):
            # "it-" prefix: Italian table templates, see
            # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_italien
            process_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "Onglets conjugaison":
            process_onglets_conjugaison_template(
                wxr, entry, t_node, conj_page_title, select_tab
            )
        elif transcluded_title.startswith("Conjugaison:"):
            # Another conjugation page is transcluded: follow it, using its
            # "sél" argument ("2" when absent) as the tab selector.
            tab_selector = clean_node(
                wxr, None, t_node.template_parameters.get("sél", "2")
            )
            extract_conjugation(wxr, entry, transcluded_title, tab_selector)
        elif name.startswith("ja-flx-adj"):
            process_ja_flx_adj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("ja-"):
            process_ja_conj_template(wxr, entry, t_node, conj_page_title)

    # Kurdish pages may also contain tables written directly on the page.
    if conj_page_title.startswith("Conjugaison:kurde/"):
        for table in root.find_child(NodeKind.TABLE):
            extract_ku_conj_trans_table_node(
                wxr, entry, table, conj_page_title
            )

    # NOTE(review): the result is discarded and no entry is passed here,
    # unlike the link loops of the ku/ko/de extractors - confirm intent.
    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, None, link_node)

83 

84 

def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """Extract the conjugation tables held in an "Onglets conjugaison" node.

    https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
    This template expands to two tabs of tables; the tab contents are the
    ``contenuN`` arguments and the tab labels the ``ongletN`` arguments.

    Args:
        wxr: Extraction context.
        entry: Word entry that receives the extracted forms.
        node: The "Onglets conjugaison" template node.
        conj_page_title: Title of the conjugation page being processed.
        select_tab: Number (as a string) of the tab requested by the caller.
    """
    if select_tab != "1" or (
        clean_node(wxr, None, node.template_parameters.get("onglet1", ""))
        == "Conjugaison active"
    ):
        # don't extract or only extract "Conjugaison pronominale" tab
        selected_tabs = [select_tab]
    else:
        selected_tabs = [str(i) for i in range(1, 7)]

    for tab_index in selected_tabs:
        arg_name = f"contenu{tab_index}"
        if arg_name not in node.template_parameters:
            break
        arg_value = node.template_parameters[arg_name]
        # A tab's content is either a single node or a list of nodes; treat
        # both the same way so that a lone "it-..." template is accepted
        # exactly like one that sits inside a list.
        content_nodes = arg_value if isinstance(arg_value, list) else [arg_value]
        for content_node in content_nodes:
            if isinstance(content_node, TemplateNode) and (
                "-conj" in content_node.template_name
                or content_node.template_name.startswith("it-")
            ):
                process_conj_template(
                    wxr, entry, content_node, conj_page_title
                )

122 

123 

def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Expand a conjugation table template and extract its forms.

    https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger

    The template is serialised back to wikitext, re-parsed with all
    templates expanded, and the resulting tree is passed to
    ``process_expanded_conj_template``.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)

138 

139 

def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """Walk an expanded conjugation template and process its ``<div>`` tables.

    The most recent heading text (a level-3 section title or an ``<h3>``
    element) is tracked and decides how each following ``<div>`` is handled.
    Nested section nodes are processed recursively.
    """
    heading = ""
    if node.kind == NodeKind.LEVEL3:
        heading = clean_node(wxr, None, node.largs)

    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue

        if child.tag == "h3":
            heading = clean_node(wxr, None, child)
        elif child.tag == "div":
            if heading == "Modes impersonnels":
                # This section has its own table layout.
                process_fr_conj_modes_table(
                    wxr, entry, child, conj_page_title
                )
            else:
                process_fr_conj_table(
                    wxr, entry, child, heading, conj_page_title
                )

166 

167 

@dataclass
class TableHeader:
    """A table header cell together with the grid span it applies to.

    Only ``text`` is required; every position and span field defaults to 0
    so that callers can fill in just the axis they need.
    """

    # Cleaned text of the header cell.
    text: str
    # Index of the first column the header covers.
    col_index: int = 0
    # Number of columns the header covers.
    colspan: int = 0
    # Index of the first row the header covers.
    row_index: int = 0
    # Number of rows the header covers.
    rowspan: int = 0

175 

176 

def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """Extract forms from the first "Modes impersonnels" table.

    Rows without any data cell provide the column headers.  In the other
    rows, a header cell (or a bold first cell) is the row header, text
    cells are accumulated into one form, and a cell ending in "]", a
    backslash or "Prononciation ?" closes the accumulated form.

    Args:
        wxr: Extraction context.
        entry: Word entry whose ``forms`` list receives the results.
        div_node: The ``<div>`` element wrapping the table(s).
        conj_page_title: Stored as ``source`` of every created form.
    """
    for table in div_node.find_child(NodeKind.TABLE):
        column_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            header_only_row = not row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            pending_form = ""
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                acts_as_header = cell.kind == NodeKind.TABLE_HEADER_CELL or (
                    cell.contain_node(NodeKind.BOLD) and col_index == 0
                )
                if acts_as_header:
                    if not header_only_row:
                        # Row header: does not advance the column counter.
                        row_header = clean_node(wxr, None, cell)
                        continue
                    header_text = clean_node(wxr, None, cell)
                    if header_text != "Mode":
                        span_str = cell.attrs.get("colspan", "1")
                        colspan = (
                            int(span_str)
                            if re.fullmatch(r"\d+", span_str) is not None
                            else 1
                        )
                        column_headers.append(
                            TableHeader(header_text, col_index, colspan)
                        )
                        col_index += colspan
                    continue

                cell_text = clean_node(wxr, None, cell)
                has_ipa = cell_text.endswith(("]", "\\"))
                closes_form = has_ipa or cell_text.endswith("Prononciation ?")
                if closes_form and pending_form != "":
                    form = Form(
                        form=pending_form,
                        ipas=[cell_text] if has_ipa else [],
                        source=conj_page_title,
                    )
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for header in column_headers:
                        first_col = header.col_index
                        if first_col <= col_index < first_col + header.colspan:
                            form.raw_tags.append(header.text)
                    translate_raw_tags(form)
                    entry.forms.append(form)
                    pending_form = ""
                elif cell_text != "":
                    # Join the parts with a space, except right after an
                    # elision apostrophe.
                    if pending_form != "" and not pending_form.endswith("’"):
                        pending_form += " "
                    pending_form += cell_text
                col_index += 1

243 

244 

def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """Dispatch the tables nested in the cells of a layout table.

    Each data cell of the outer table(s) under ``div_node`` is scanned for
    an inner table: an HTML ``<table>`` element goes to
    ``process_fr_conj_html_table`` and a wikitext table node goes to
    ``process_fr_conj_wiki_table``.
    """
    for outer_table in div_node.find_child(NodeKind.TABLE):
        for outer_row in outer_table.find_child(NodeKind.TABLE_ROW):
            for outer_cell in outer_row.find_child(NodeKind.TABLE_CELL):
                for inner in outer_cell.children:
                    if not isinstance(inner, WikiNode):
                        # plain text between nodes
                        continue
                    if inner.kind == NodeKind.HTML and inner.tag == "table":
                        process_fr_conj_html_table(
                            wxr, entry, inner, h3_text, conj_page_title
                        )
                    elif inner.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, inner, h3_text, conj_page_title
                        )

272 

273 

def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
):
    """Extract one form per ``<tr>`` of an HTML conjugation table.

    The first ``<tr>`` is the table title and becomes a raw tag next to
    ``h3_text``.  In every later row the first two ``<td>`` cells build the
    form text and the remaining cells are concatenated into a single IPA
    string.
    """
    tags = []
    if h3_text != "":
        tags.append(h3_text)

    for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
        if tr_index == 0:
            tags.append(clean_node(wxr, None, tr_node.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for td_index, td_node in enumerate(
            tr_node.find_html_recursively("td")
        ):
            td_text = clean_node(wxr, None, td_node)
            if td_index < 2:
                form.form += td_text
                # Space after the first cell unless it ends with an
                # elision apostrophe.
                if td_index == 0 and not td_text.endswith("’"):
                    form.form += " "
            elif len(form.ipas) > 0:
                form.ipas[0] += td_text
            else:
                # First IPA cell: add a separating space unless it ends
                # with the liaison tie "‿".
                if not td_text.endswith("‿"):
                    td_text += " "
                form.ipas.append(td_text)

        translate_raw_tags(form)
        entry.forms.append(form)

305 

306 

def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
):
    """Extract one form per row of a wikitext conjugation table.

    The first row is the table title and becomes a raw tag next to
    ``h3_text``.  In every later row the first two cells build the form
    text and the following cells are stored as separate IPA strings; rows
    that end up with an empty form are dropped.
    """
    tags = []
    if h3_text != "":
        tags.append(h3_text)

    for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
        if row_index == 0:
            tags.append(clean_node(wxr, None, row.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for cell_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            cell_text = clean_node(wxr, None, cell)
            missing_pron = cell_text.endswith("Prononciation ?")
            if cell_index >= 2:
                if not missing_pron:
                    form.ipas.append(cell_text)
                continue

            if cell_text == "—" or missing_pron:
                continue
            # A part starting with "-" is glued to the preceding text,
            # unless that text ends with ")".
            glue_to_previous = cell_text.startswith("-")
            if glue_to_previous and not form.form.strip().endswith(")"):
                form.form = form.form.strip()
            form.form += cell_text
            if cell_index == 0 and len(cell_text) > 0:
                form.form += " "

        if len(form.form) > 0:
            translate_raw_tags(form)
            entry.forms.append(form)

342 

343 

344def process_ja_flx_adj_template( 

345 wxr: WiktextractContext, 

346 entry: WordEntry, 

347 template_node: TemplateNode, 

348 conj_page_title: str, 

349) -> None: 

350 # https://fr.wiktionary.org/wiki/Modèle:ja-adj 

351 # https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な 

352 expanded_template = wxr.wtp.parse( 

353 wxr.wtp.node_to_wikitext(template_node), expand_all=True 

354 ) 

355 for table_node in expanded_template.find_child(NodeKind.TABLE): 

356 first_tag = "" 

357 for row in table_node.find_child(NodeKind.TABLE_ROW): 

358 forms = [] 

359 tags = [first_tag] 

360 for cell_index, row_child in enumerate( 

361 row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL) 

362 ): 

363 row_child_text = clean_node(wxr, None, row_child) 

364 if row_child.kind == NodeKind.TABLE_HEADER_CELL: 

365 first_tag = row_child_text 

366 else: 

367 for line_index, line in enumerate( 

368 row_child_text.splitlines() 

369 ): 

370 if cell_index == 0: 

371 tags.append(line) 

372 continue 

373 if line_index + 1 > len(forms): 

374 forms.append( 

375 translate_raw_tags( 

376 Form(raw_tags=tags, source=conj_page_title) 

377 ) 

378 ) 

379 if cell_index == 1: 

380 forms[line_index].form = line 

381 elif cell_index == 2: 

382 forms[line_index].hiragana = line 

383 elif cell_index == 3: 383 ↛ 367line 383 didn't jump to line 367 because the condition on line 383 was always true

384 forms[line_index].roman = line 

385 

386 entry.forms.extend(forms) 

387 

388 

def process_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Extract forms from a Japanese verb conjugation template.

    https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    Modèle:ja-在る

    A row holding a single header cell sets the section tag.  Other header
    cells are row headers that stay active for ``rowspan`` rows.  In data
    rows, the lines of cell 0 are the forms, and the lines of cells 1 and 2
    are their hiragana and romanisation.

    Args:
        wxr: Extraction context.
        entry: Word entry whose ``forms`` list receives the results.
        template_node: The "ja-..." template to expand.
        conj_page_title: Stored as ``source`` of every created form.
    """
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows (including the current one) for
        # which the header still applies
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if (
                all(
                    isinstance(c, WikiNode)
                    and c.kind == NodeKind.TABLE_HEADER_CELL
                    for c in row.children
                )
                and len(row.children) > 1
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    # A malformed "rowspan" attribute must not abort the
                    # extraction of the whole page: treat it as 1.
                    try:
                        rowspan = int(header.attrs.get("rowspan", "1"))
                    except ValueError:
                        rowspan = 1
                    row_headers[header_text] = rowspan

            tags = [first_tag]
            # Iterate over a copy because exhausted headers are deleted.
            for tag, rowspan in row_headers.copy().items():
                tags.append(tag)
                if rowspan == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = rowspan - 1

            forms = []
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                for line_index, line in enumerate(cell_text.splitlines()):
                    if cell_index == 0:
                        forms.append(
                            Form(
                                form=line.strip(),
                                raw_tags=tags,
                                source=conj_page_title,
                            )
                        )
                    elif cell_index == 1 and line_index < len(forms):
                        forms[line_index].hiragana = line.strip()
                    elif cell_index == 2 and line_index < len(forms):
                        forms[line_index].roman = line.strip()

            for form in forms:
                if len(form.form) > 0:
                    translate_raw_tags(form)
                    entry.forms.append(form)

453 

454 

def extract_ku_conj_trans_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Expand a Kurdish conjugation template and extract its tables.

    Every top-level table of the expansion is passed to
    ``extract_ku_conj_trans_table_node``; every top-level link is then
    cleaned with ``entry`` as target (presumably to record category links
    on the entry - verify against ``clean_node``).
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for table_node in expanded.find_child(NodeKind.TABLE):
        extract_ku_conj_trans_table_node(
            wxr, entry, table_node, conj_page_title
        )
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link)

468 

469 

def extract_ku_conj_trans_table_node(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    conj_page_title: str,
) -> None:
    """Extract forms from one Kurdish conjugation table.

    Header cells, apart from the table/section titles listed below, become
    column headers.  The header list is reset when a row with headers
    follows a row without any.  Each non-empty data cell becomes a form
    tagged with the column headers covering its column.  Cells equal to
    the page title are skipped, as are all data cells while the last
    header seen is "TEMPS DU PASSÉ".
    """
    # Headers starting with one of these are titles, not column headers.
    title_prefixes = (
        "Conjugaison du verbe",
        "TEMPS DU PRÉSENT ET DU FUTUR",
        "TEMPS DU PRESENT ET DU FUTUR",
        "TEMPS DU PASSÉ",
        "TEMPS DU PASSE",
    )
    column_headers = []
    previous_row_had_header = False
    latest_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        row_has_header = row.contain_node(NodeKind.TABLE_HEADER_CELL)
        if row_has_header and not previous_row_had_header:
            column_headers.clear()

        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            text = clean_node(wxr, None, cell)
            if text == "":
                col_index += 1
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                latest_header = text
                if text.startswith(title_prefixes):
                    continue
                colspan, _ = get_cell_span(cell)
                column_headers.append(
                    TableHeader(text, col_index=col_index, colspan=colspan)
                )
                col_index += colspan
                continue

            # NOTE(review): only the accented spelling is matched here,
            # although the unaccented one is also listed above - confirm.
            if latest_header == "TEMPS DU PASSÉ":
                continue
            if text == "(inusité)":
                col_index += 1
                continue
            if text == wxr.wtp.title:
                # the column counter is intentionally left untouched, as in
                # the original control flow
                continue

            form = Form(form=text, source=conj_page_title)
            for header in column_headers:
                first_col = header.col_index
                if first_col <= col_index < first_col + header.colspan:
                    form.raw_tags.append(header.text)
            translate_raw_tags(form)
            entry.forms.append(form)
            col_index += 1

        previous_row_had_header = row_has_header

533 

534 

535def extract_ko_conj_template( 

536 wxr: WiktextractContext, 

537 entry: WordEntry, 

538 t_node: TemplateNode, 

539 conj_page_title: str, 

540) -> None: 

541 word_page_title = wxr.wtp.title 

542 wxr.wtp.title = conj_page_title 

543 expanded_node = wxr.wtp.parse( 

544 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

545 ) 

546 for h3 in expanded_node.find_html("h3"): 

547 clean_node(wxr, entry, h3) 

548 for table_index, table in enumerate( 

549 expanded_node.find_child(NodeKind.TABLE) 

550 ): 

551 if table_index == 0: 

552 continue 

553 shared_raw_tags = [] 

554 for caption_node in table.find_child(NodeKind.TABLE_CAPTION): 

555 caption = clean_node(wxr, None, caption_node.children) 

556 if caption != "": 556 ↛ 554line 556 didn't jump to line 554 because the condition on line 556 was always true

557 shared_raw_tags.append(caption) 

558 col_headers = [] 

559 row_headers = [] 

560 row_index = 0 

561 row_header_indexes = [0] 

562 for row in table.find_child(NodeKind.TABLE_ROW): 

563 col_index = 0 

564 for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL): 

565 cell_str = clean_node(wxr, None, header_cell) 

566 if cell_str == "": 

567 continue 

568 colspan, rowspan = get_cell_span(header_cell) 

569 if row.contain_node(NodeKind.TABLE_CELL): 

570 header_added = False 

571 current_row_index = row_index 

572 for index, row_header_index in enumerate( 572 ↛ 580line 572 didn't jump to line 580 because the loop on line 572 didn't complete

573 row_header_indexes 

574 ): 

575 if row_index >= row_header_index: 

576 current_row_index = row_header_indexes[index] 

577 row_header_indexes[index] += rowspan 

578 header_added = True 

579 break 

580 if not header_added: 580 ↛ 581line 580 didn't jump to line 581 because the condition on line 580 was never true

581 row_header_indexes.append(rowspan) 

582 row_headers.append( 

583 TableHeader( 

584 text=cell_str, 

585 row_index=current_row_index, 

586 rowspan=rowspan, 

587 ) 

588 ) 

589 else: 

590 col_headers.append( 

591 TableHeader( 

592 text=cell_str, 

593 col_index=col_index, 

594 colspan=colspan, 

595 ) 

596 ) 

597 col_index += colspan 

598 if row.contain_node(NodeKind.TABLE_CELL): 

599 row_index += 1 

600 

601 row_index = 0 

602 for row in table.find_child(NodeKind.TABLE_ROW): 

603 col_index = 0 

604 for cell in row.find_child(NodeKind.TABLE_CELL): 

605 cell_str = clean_node(wxr, None, cell) 

606 colspan, rowspan = get_cell_span(cell) 

607 if cell_str == "—": 607 ↛ 608line 607 didn't jump to line 608 because the condition on line 607 was never true

608 col_index += 1 

609 else: 

610 form = Form( 

611 source=conj_page_title, raw_tags=shared_raw_tags 

612 ) 

613 for line_index, line in enumerate(cell_str.splitlines()): 

614 match line_index: 

615 case 0: 

616 form.form = line 

617 case 1: 

618 form.roman = line 

619 case 2: 619 ↛ 613line 619 didn't jump to line 613 because the pattern on line 619 always matched

620 form.ipas.append(line) 

621 for header in col_headers: 

622 if ( 

623 col_index >= header.col_index 

624 and col_index < header.col_index + header.colspan 

625 ): 

626 form.raw_tags.append(header.text) 

627 for header in row_headers: 

628 if ( 

629 row_index < header.row_index + header.rowspan 

630 and row_index + rowspan > header.row_index 

631 ): 

632 form.raw_tags.append(header.text) 

633 if form.form not in ["", wxr.wtp.title]: 633 ↛ 636line 633 didn't jump to line 636 because the condition on line 633 was always true

634 translate_raw_tags(form) 

635 entry.forms.append(form) 

636 col_index += 1 

637 if row.contain_node(NodeKind.TABLE_CELL): 

638 row_index += 1 

639 

640 for link in expanded_node.find_child(NodeKind.LINK): 

641 clean_node(wxr, entry, link) 

642 wxr.wtp.title = word_page_title 

643 

644 

def get_cell_span(cell: WikiNode) -> tuple[int, int]:
    """Return the ``(colspan, rowspan)`` of a table cell.

    Each value is read from ``cell.attrs``; a missing attribute, or one
    that is not made of digits only, counts as a span of 1.
    """
    spans = []
    for attr_name in ("colspan", "rowspan"):
        raw_value = cell.attrs.get(attr_name, "1")
        is_number = re.fullmatch(r"\d+", raw_value) is not None
        spans.append(int(raw_value) if is_number else 1)
    return spans[0], spans[1]

655 

656 

def extract_de_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Extract forms from an expanded "de-conj" template.

    The template is expanded with ``wxr.wtp.title`` temporarily set to the
    conjugation page title; the word's title is restored right after the
    expansion, even if it raises.

    In the first two tables each data cell is one form, optionally written
    as "label: form".  In the later tables an even-indexed cell is kept and
    prepended to the following odd-indexed cell, and the pair shares column
    header ``col_index // 2``.  Forms equal to the word's title are not
    added.

    Args:
        wxr: Extraction context.
        word_entry: Word entry that receives the forms.
        t_node: The "de-conj" template node.
        conj_page_title: Conjugation page title, stored as form ``source``.
    """
    word_page_title = wxr.wtp.title
    wxr.wtp.title = conj_page_title
    try:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
    finally:
        # Never leave the context pointing at the conjugation page.
        wxr.wtp.title = word_page_title

    for table_index, table in enumerate(
        expanded_node.find_child(NodeKind.TABLE)
    ):
        table_header = ""
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            word_part = ""
            col_index = 0
            if table_index >= 2 and row.contain_node(
                NodeKind.TABLE_HEADER_CELL
            ):
                # later tables: every header row starts a new header set
                col_headers.clear()
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    # empty cells do not advance the column counter
                    continue
                elif cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if len(row.children) == 1:
                        table_header = cell_text
                    else:
                        col_headers.append(cell_text)
                elif table_index < 2:
                    form = Form(form=cell_text, source=conj_page_title)
                    if ":" in cell_text:
                        label, _, form_text = cell_text.partition(":")
                        raw_tag = label.strip()
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        form.form = form_text.strip()
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if col_index < len(col_headers):
                        form.raw_tags.append(col_headers[col_index])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                elif col_index % 2 == 0:
                    word_part = cell_text
                else:
                    form = Form(
                        form=f"{word_part} {cell_text}", source=conj_page_title
                    )
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if col_index // 2 < len(col_headers):
                        form.raw_tags.append(col_headers[col_index // 2])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += 1

    for cat_link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, cat_link)

724 

725 

def extract_declension_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
):
    """Extract forms from a declension page of the "Appendix" namespace.

    Does nothing when the page cannot be found; otherwise every top-level
    template of the page is passed to ``extract_declension_template`` with
    an empty tab name.
    """
    appendix_ns_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
    body = wxr.wtp.get_page_body(page_title, appendix_ns_id)
    if body is None:
        return
    parsed_page = wxr.wtp.parse(body)
    for template in parsed_page.find_child(NodeKind.TEMPLATE):
        extract_declension_template(
            wxr, word_entry, page_title, template, ""
        )

737 

738 

def extract_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
) -> None:
    """Route one template of a declension page to its extractor.

    German adjective declension templates are extracted directly.  An
    "Onglets conjugaison" template is unpacked: for each ``ongletN`` label
    (N = 1..6, stopping at the first missing label) the templates found in
    ``contenuN`` are processed recursively with the label as tab name.

    Args:
        wxr: Extraction context.
        word_entry: Word entry that receives the forms.
        page_title: Title of the declension page.
        t_node: Template node to process.
        tab_name: Label of the enclosing tab, or "" outside of any tab.
    """
    if t_node.template_name in (
        "de-adjectif-déclinaisons",
        "de-adj-déclinaisons",
    ):
        extract_de_adj_declension_template(
            wxr, word_entry, page_title, t_node, tab_name
        )
    elif t_node.template_name == "Onglets conjugaison":
        params = t_node.template_parameters
        for index in range(1, 7):
            tab_name_arg = f"onglet{index}"
            if tab_name_arg not in params:
                break
            content_arg = f"contenu{index}"
            if content_arg not in params:
                # A labelled tab without content has nothing to extract;
                # skip it instead of raising KeyError.
                continue
            current_tab_name = clean_node(wxr, None, params[tab_name_arg])
            tab_content = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(params[content_arg])
            )
            for node in tab_content.find_child(NodeKind.TEMPLATE):
                extract_declension_template(
                    wxr, word_entry, page_title, node, current_tab_name
                )

770 

771 

def extract_de_adj_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
) -> None:
    """Extract forms from a German adjective declension template.

    https://fr.wiktionary.org/wiki/Modèle:de-adjectif-déclinaisons

    Each section of the expanded template holds tables.  Rows without data
    cells give the column headers (the "Forme" header is not recorded); in
    data rows the header cell is the row header.  A cell under an "Article"
    column sets the article of the forms that follow it in the row; every
    other data cell becomes a form tagged with its column headers, the tab
    name, the section title, the table caption and the row header.  Forms
    equal to the page being processed (``wxr.wtp.title``) are not added.

    Args:
        wxr: Extraction context.
        word_entry: Word entry that receives the forms.
        page_title: Declension page title, stored as form ``source``.
        t_node: The declension template node.
        tab_name: Label of the enclosing tab, or "" outside of any tab.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for level_node in expanded_node.find_child(LEVEL_KIND_FLAGS):
        section_title = clean_node(wxr, None, level_node.largs)
        for table in level_node.find_child(NodeKind.TABLE):
            table_caption = ""
            for cap_node in table.find_child(NodeKind.TABLE_CAPTION):
                table_caption = clean_node(wxr, None, cap_node.children)
            col_headers = []
            for row in table.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                row_has_data = row.contain_node(NodeKind.TABLE_CELL)
                article = ""
                for cell in row.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    # A malformed "colspan" attribute must not abort the
                    # extraction of the whole page: treat it as 1.
                    try:
                        colspan = int(cell.attrs.get("colspan", "1"))
                    except ValueError:
                        colspan = 1
                    cell_text = clean_node(wxr, None, cell)
                    if cell.kind == NodeKind.TABLE_HEADER_CELL:
                        if row_has_data:
                            row_header = cell_text
                        elif cell_text != "Forme":
                            col_headers.append(
                                TableHeader(
                                    cell_text,
                                    col_index=col_index,
                                    colspan=colspan,
                                )
                            )
                    else:
                        use_col_headers = [
                            col_header.text
                            for col_header in col_headers
                            if col_header.col_index
                            <= col_index
                            < col_header.col_index + col_header.colspan
                        ]
                        if "Article" in use_col_headers:
                            if cell_text != "—":
                                article = cell_text
                        else:
                            form = Form(
                                form=cell_text,
                                article=article,
                                raw_tags=use_col_headers,
                                source=page_title,
                            )
                            for raw_tag in (
                                tab_name,
                                section_title,
                                table_caption,
                                row_header,
                            ):
                                if raw_tag != "":
                                    form.raw_tags.append(raw_tag)
                            if form.form not in ["", wxr.wtp.title]:
                                translate_raw_tags(form)
                                word_entry.forms.append(form)
                    # Header and data cells both advance the column counter.
                    col_index += colspan

        for link in level_node.find_child(NodeKind.LINK):
            clean_node(wxr, word_entry, link)