Coverage for src / wiktextract / extractor / fr / conjugation.py: 92%

550 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-21 08:01 +0000

1import re 

2from dataclasses import dataclass 

3from itertools import chain 

4 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 HTMLNode, 

8 NodeKind, 

9 TemplateNode, 

10 WikiNode, 

11) 

12 

13from ...page import clean_node 

14from ...wxr_context import WiktextractContext 

15from .models import Form, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """
    Load a page of the "Conjugaison" namespace and hand each of its
    top-level templates to the matching table extractor.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons

    The called extractors receive `entry` and `conj_page_title`.
    `select_tab` is only forwarded to the "Onglets conjugaison" handler.
    Returns without doing anything when the page body is not found.
    """
    page_text = wxr.wtp.get_page_body(
        conj_page_title, wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    )
    if page_text is None:
        return
    root = wxr.wtp.parse(page_text)
    for t_node in root.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name.endswith("-intro"):
            continue
        # the order of the checks matters: language specific templates must
        # be matched before the generic "-conj" test
        if name in ("ku-conj-trans", "ku-conj"):
            extract_ku_conj_trans_template(wxr, entry, t_node, conj_page_title)
        elif name == "ko-conj":
            extract_ko_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "de-conj":
            extract_de_conj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("pt-conj/"):
            extract_pt_conj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("cs-conj-"):
            extract_cs_conj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith(("ro-verb-", "se-conj-")):
            from .inflection import extract_inf_table_template

            extract_inf_table_template(wxr, entry, t_node, conj_page_title)
        elif (
            "-conj" in name
            # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_italien
            # Italian table templates
            or name.startswith("it-")
        ):
            process_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "Onglets conjugaison":
            process_onglets_conjugaison_template(
                wxr, entry, t_node, conj_page_title, select_tab
            )
        elif name.removeprefix(":").startswith("Conjugaison:"):
            # another conjugation page is transcluded: process that page,
            # its "sél" argument (default "2") becomes the selected tab
            transcluded_tab = clean_node(
                wxr, None, t_node.template_parameters.get("sél", "2")
            )
            extract_conjugation(
                wxr, entry, name.removeprefix(":"), transcluded_tab
            )
        elif name.startswith("ja-flx-adj"):
            process_ja_flx_adj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("ja-"):
            process_ja_conj_template(wxr, entry, t_node, conj_page_title)

    if conj_page_title.startswith("Conjugaison:kurde/"):
        # Kurdish pages may contain tables written directly in the page
        for table in root.find_child(NodeKind.TABLE):
            extract_ku_conj_trans_table_node(wxr, entry, table, conj_page_title)

    for link_node in root.find_child(NodeKind.LINK):
        # NOTE(review): the result is discarded and `None` is passed here,
        # while the other extractors in this file pass the entry for link
        # nodes - confirm whether `entry` was intended.
        clean_node(wxr, None, link_node)

94 

95 

def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """
    Extract the conjugation table templates stored in the "contenuN"
    arguments of an "Onglets conjugaison" template.

    Only the tab `select_tab` is read when it is not "1", or when the
    first tab is named "Conjugaison active"; otherwise tabs 1 to 6 are
    read until the first missing "contenuN" argument.
    """
    # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
    # this template expands to two tabs of tables
    params = node.template_parameters
    only_selected_tab = (
        select_tab != "1"
        or clean_node(wxr, None, params.get("onglet1", ""))
        == "Conjugaison active"
    )
    if only_selected_tab:
        selected_tabs = [select_tab]
    else:
        selected_tabs = [str(i) for i in range(1, 7)]

    for tab_index in selected_tabs:
        arg_name = f"contenu{tab_index}"
        if arg_name not in params:
            break
        arg_value = params[arg_name]
        # an argument is either a single node/string or a list of them;
        # both shapes go through the same template name test so a lone
        # Italian "it-" template is no longer ignored
        arg_nodes = arg_value if isinstance(arg_value, list) else [arg_value]
        for arg_node in arg_nodes:
            if isinstance(arg_node, TemplateNode) and (
                "-conj" in arg_node.template_name
                or arg_node.template_name.startswith("it-")
            ):
                process_conj_template(wxr, entry, arg_node, conj_page_title)

133 

134 

def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a conjugation table template completely and pass the parsed
    result to `process_expanded_conj_template`.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    # https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)

149 

150 

def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Walk an expanded conjugation template and extract every table `<div>`.

    The text of the latest `<h3>` (or of the level 3 heading when `node`
    is one) is remembered and given to the table extractor. The
    "Modes impersonnels" section has its own extractor. Nested heading
    nodes are processed recursively.
    """
    if node.kind == NodeKind.LEVEL3:
        section_title = clean_node(wxr, None, node.largs)
    else:
        section_title = ""

    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue
        if child.tag == "h3":
            section_title = clean_node(wxr, None, child)
        elif child.tag == "div":
            if section_title == "Modes impersonnels":
                process_fr_conj_modes_table(wxr, entry, child, conj_page_title)
            else:
                process_fr_conj_table(
                    wxr, entry, child, section_title, conj_page_title
                )

177 

178 

@dataclass
class TableHeader:
    """Text of a table header cell and the grid area the extractors
    assign to it. Only `text` is required, the numbers default to 0."""

    # cleaned text of the header cell, used as a raw tag
    text: str
    # column at which the header starts
    col_index: int = 0
    # number of columns covered by the header
    colspan: int = 0
    # row at which the header starts
    row_index: int = 0
    # number of rows covered by the header
    rowspan: int = 0

186 

187 

def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the tables found in the `<div>` of the
    "Modes impersonnels" section.

    Text cells of a row are joined into one form text until a
    pronunciation cell is met; the form is then added to `entry.forms`
    with the row header and the matching column headers as raw tags.
    """
    # the first "Modes impersonnels" table
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in div_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            is_header_row = not row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            form_text = ""
            for cell in row.find_child(cell_kinds):
                # a bold cell in the first column is treated like a header
                is_header_cell = cell.kind == NodeKind.TABLE_HEADER_CELL or (
                    cell.contain_node(NodeKind.BOLD) and col_index == 0
                )
                if is_header_cell:
                    header_text = clean_node(wxr, None, cell)
                    if not is_header_row:
                        row_header = header_text
                    elif header_text != "Mode":
                        # the "Mode" header is ignored and takes no column
                        colspan, _ = get_cell_span(cell)
                        col_headers.append(
                            TableHeader(header_text, col_index, colspan)
                        )
                        col_index += colspan
                    continue

                cell_text = clean_node(wxr, None, cell)
                if form_text != "" and cell_text.endswith(
                    ("]", "\\", "Prononciation ?")
                ):
                    # pronunciation cell: it closes the pending form text
                    if cell_text.endswith(("]", "\\")):
                        ipas = [cell_text]
                    else:
                        ipas = []
                    form = Form(
                        form=form_text, ipas=ipas, source=conj_page_title
                    )
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    form.raw_tags.extend(
                        header.text
                        for header in col_headers
                        if header.col_index
                        <= col_index
                        < header.col_index + header.colspan
                    )
                    translate_raw_tags(form)
                    entry.forms.append(form)
                    form_text = ""
                elif cell_text != "":
                    # no space is inserted after an apostrophe
                    if form_text != "" and not form_text.endswith("’"):
                        form_text += " "
                    form_text += cell_text
                col_index += 1

254 

255 

def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Find the tables nested in the cells of the tables of `div_node` and
    pass each of them, with the section title `h3_text`, to the HTML or
    wikitext table extractor.
    """
    for outer_table in div_node.find_child(NodeKind.TABLE):
        for outer_row in outer_table.find_child(NodeKind.TABLE_ROW):
            for outer_cell in outer_row.find_child(NodeKind.TABLE_CELL):
                for inner_node in outer_cell.children:
                    if not isinstance(inner_node, WikiNode):
                        continue
                    if (
                        inner_node.kind == NodeKind.HTML
                        and inner_node.tag == "table"
                    ):
                        process_fr_conj_html_table(
                            wxr, entry, inner_node, h3_text, conj_page_title
                        )
                    elif inner_node.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, inner_node, h3_text, conj_page_title
                        )

283 

284 

def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per `<tr>` of an HTML `<table>` node.

    The text of the first `<tr>` is used as a raw tag, after `h3_text`
    when that is not empty. In the other rows the first two `<td>` make
    up the form and the remaining ones are joined into a single IPA
    string.
    """
    raw_tags = []
    if h3_text != "":
        raw_tags.append(h3_text)

    for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
        if tr_index == 0:
            raw_tags.append(clean_node(wxr, None, tr_node.children))
            continue

        form = Form(raw_tags=raw_tags, source=conj_page_title)
        for td_index, td_node in enumerate(
            tr_node.find_html_recursively("td")
        ):
            td_text = clean_node(wxr, None, td_node)
            if td_index < 2:
                form.form += td_text
                # no space after a first cell ending with an apostrophe
                if td_index == 0 and not td_text.endswith("’"):
                    form.form += " "
            elif len(form.ipas) > 0:
                form.ipas[0] += td_text
            else:
                # first IPA cell: a space is added unless it ends with "‿"
                if not td_text.endswith("‿"):
                    td_text += " "
                form.ipas.append(td_text)

        translate_raw_tags(form)
        entry.forms.append(form)

316 

317 

def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per row of a wikitext table.

    The text of the first row is used as a raw tag, after `h3_text` when
    that is not empty. In the other rows the first two cells make up the
    form and later cells are IPA strings. Rows that give an empty form
    are dropped.
    """
    raw_tags = []
    if h3_text != "":
        raw_tags.append(h3_text)

    for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
        if row_index == 0:
            raw_tags.append(clean_node(wxr, None, row.children))
            continue

        form = Form(raw_tags=raw_tags, source=conj_page_title)
        for cell_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            cell_text = clean_node(wxr, None, cell)
            lacks_pronunciation = cell_text.endswith("Prononciation ?")
            if cell_index >= 2:
                if not lacks_pronunciation:
                    form.ipas.append(cell_text)
                continue
            if cell_text == "—" or lacks_pronunciation:
                continue
            # a cell starting with "-" is attached to the previous text
            # without a space, except after a closing parenthesis
            if cell_text.startswith("-") and not form.form.strip().endswith(
                ")"
            ):
                form.form = form.form.strip()
            form.form += cell_text
            if cell_index == 0 and len(cell_text) > 0:
                form.form += " "

        if len(form.form) > 0:
            translate_raw_tags(form)
            entry.forms.append(form)

353 

354 

def process_ja_flx_adj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the tables of an expanded "ja-flx-adj" template.

    In a data row, the lines of the first cell are raw tags and the
    lines of the cells at index 1, 2 and 3 fill the `form`, `hiragana`
    and `roman` fields; one form is created per line. The text of a
    header cell becomes the first raw tag of the following rows.
    """
    # https://fr.wiktionary.org/wiki/Modèle:ja-adj
    # https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な
    field_of_column = {1: "form", 2: "hiragana", 3: "roman"}
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table in expanded_root.find_child(NodeKind.TABLE):
        section_tag = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_forms = []
            # a header cell of this row only affects the next rows
            row_tags = [section_tag]
            for cell_index, cell in enumerate(row.find_child(cell_kinds)):
                cell_text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    section_tag = cell_text
                    continue
                lines = cell_text.splitlines()
                if cell_index == 0:
                    row_tags.extend(lines)
                    continue
                for line_index, line in enumerate(lines):
                    if line_index >= len(row_forms):
                        new_form = Form(
                            raw_tags=row_tags, source=conj_page_title
                        )
                        row_forms.append(translate_raw_tags(new_form))
                    field_name = field_of_column.get(cell_index)
                    if field_name is not None:
                        setattr(row_forms[line_index], field_name, line)

            entry.forms.extend(row_forms)

398 

399 

def process_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the tables of an expanded Japanese verb
    conjugation template.

    A row holding a single header cell gives the first raw tag of the
    following rows. Other header cells are row headers that stay active
    for `rowspan` rows. The lines of the data cells at index 0, 1 and 2
    fill the `form`, `hiragana` and `roman` fields. Forms with an empty
    `form` are dropped.
    """
    # https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    # Modèle:ja-在る
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows, counting the current one, for
        # which the header is still used
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if len(row.children) > 1 and all(
                isinstance(c, WikiNode)
                and c.kind == NodeKind.TABLE_HEADER_CELL
                for c in row.children
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    # `get_cell_span` falls back to 1 for a non-numeric
                    # attribute where a bare `int()` would raise ValueError
                    _, rowspan = get_cell_span(header)
                    row_headers[header_text] = rowspan

            tags = [first_tag, *row_headers]
            # iterate over a copy because spent headers are deleted
            for tag, rowspan in list(row_headers.items()):
                if rowspan == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = rowspan - 1

            forms = []
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                for line_index, line in enumerate(cell_text.splitlines()):
                    if cell_index == 0:
                        forms.append(
                            Form(
                                form=line.strip(),
                                raw_tags=tags,
                                source=conj_page_title,
                            )
                        )
                    elif cell_index == 1 and line_index < len(forms):
                        forms[line_index].hiragana = line.strip()
                    elif cell_index == 2 and line_index < len(forms):
                        forms[line_index].roman = line.strip()

            for form in forms:
                if len(form.form) > 0:
                    translate_raw_tags(form)
                    entry.forms.append(form)

464 

465 

def extract_ku_conj_trans_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a Kurdish conjugation template, extract every table of the
    result and run `clean_node` with the entry on its top-level links.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table_node in expanded_root.find_child(NodeKind.TABLE):
        extract_ku_conj_trans_table_node(
            wxr, entry, table_node, conj_page_title
        )
    for link in expanded_root.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link)

479 

480 

def extract_ku_conj_trans_table_node(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from one table of a Kurdish conjugation page.

    Header cells are column headers, except the section titles listed in
    `ignore_headers`. Each non-empty data cell becomes a form tagged with
    the column headers covering its column. Cells equal to the page
    title or to "(inusité)" are not added.
    """
    ignore_headers = (
        "Conjugaison du verbe",
        "TEMPS DU PRÉSENT ET DU FUTUR",
        "TEMPS DU PRESENT ET DU FUTUR",
        "TEMPS DU PASSÉ",
        "TEMPS DU PASSE",
    )
    # data cells met while one of these is the latest header are skipped;
    # both spellings are checked because `ignore_headers` accepts both
    past_tense_headers = ("TEMPS DU PASSÉ", "TEMPS DU PASSE")
    col_headers = []
    last_row_has_header = False
    last_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        current_row_has_header = row.contain_node(NodeKind.TABLE_HEADER_CELL)
        if not last_row_has_header and current_row_has_header:
            # a header row after a data row starts a new group of columns
            col_headers.clear()
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_str = clean_node(wxr, None, cell)
            if cell_str == "":
                col_index += 1
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_str.startswith(ignore_headers):
                    last_header = cell_str
                    continue
                colspan, _ = get_cell_span(cell)
                col_headers.append(
                    TableHeader(
                        text=cell_str, col_index=col_index, colspan=colspan
                    )
                )
                last_header = cell_str
                col_index += colspan
            elif last_header in past_tense_headers:
                continue
            elif cell_str == "(inusité)":
                col_index += 1
            elif cell_str != wxr.wtp.title:
                form = Form(form=cell_str, source=conj_page_title)
                form.raw_tags.extend(
                    header.text
                    for header in col_headers
                    if header.col_index
                    <= col_index
                    < header.col_index + header.colspan
                )
                translate_raw_tags(form)
                entry.forms.append(form)
                col_index += 1
        last_row_has_header = current_row_has_header

544 

545 

546def extract_ko_conj_template( 

547 wxr: WiktextractContext, 

548 entry: WordEntry, 

549 t_node: TemplateNode, 

550 conj_page_title: str, 

551) -> None: 

552 word_page_title = wxr.wtp.title 

553 wxr.wtp.title = conj_page_title 

554 expanded_node = wxr.wtp.parse( 

555 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

556 ) 

557 for h3 in expanded_node.find_html("h3"): 

558 clean_node(wxr, entry, h3) 

559 for table_index, table in enumerate( 

560 expanded_node.find_child(NodeKind.TABLE) 

561 ): 

562 if table_index == 0: 

563 continue 

564 shared_raw_tags = [] 

565 for caption_node in table.find_child(NodeKind.TABLE_CAPTION): 

566 caption = clean_node(wxr, None, caption_node.children) 

567 if caption != "": 567 ↛ 565line 567 didn't jump to line 565 because the condition on line 567 was always true

568 shared_raw_tags.append(caption) 

569 col_headers = [] 

570 row_headers = [] 

571 row_index = 0 

572 row_header_indexes = [0] 

573 for row in table.find_child(NodeKind.TABLE_ROW): 

574 col_index = 0 

575 for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL): 

576 cell_str = clean_node(wxr, None, header_cell) 

577 if cell_str == "": 

578 continue 

579 colspan, rowspan = get_cell_span(header_cell) 

580 if row.contain_node(NodeKind.TABLE_CELL): 

581 header_added = False 

582 current_row_index = row_index 

583 for index, row_header_index in enumerate( 583 ↛ 591line 583 didn't jump to line 591 because the loop on line 583 didn't complete

584 row_header_indexes 

585 ): 

586 if row_index >= row_header_index: 

587 current_row_index = row_header_indexes[index] 

588 row_header_indexes[index] += rowspan 

589 header_added = True 

590 break 

591 if not header_added: 591 ↛ 592line 591 didn't jump to line 592 because the condition on line 591 was never true

592 row_header_indexes.append(rowspan) 

593 row_headers.append( 

594 TableHeader( 

595 text=cell_str, 

596 row_index=current_row_index, 

597 rowspan=rowspan, 

598 ) 

599 ) 

600 else: 

601 col_headers.append( 

602 TableHeader( 

603 text=cell_str, 

604 col_index=col_index, 

605 colspan=colspan, 

606 ) 

607 ) 

608 col_index += colspan 

609 if row.contain_node(NodeKind.TABLE_CELL): 

610 row_index += 1 

611 

612 row_index = 0 

613 for row in table.find_child(NodeKind.TABLE_ROW): 

614 col_index = 0 

615 for cell in row.find_child(NodeKind.TABLE_CELL): 

616 cell_str = clean_node(wxr, None, cell) 

617 colspan, rowspan = get_cell_span(cell) 

618 if cell_str == "—": 618 ↛ 619line 618 didn't jump to line 619 because the condition on line 618 was never true

619 col_index += 1 

620 else: 

621 form = Form( 

622 source=conj_page_title, raw_tags=shared_raw_tags 

623 ) 

624 for line_index, line in enumerate(cell_str.splitlines()): 

625 match line_index: 

626 case 0: 

627 form.form = line 

628 case 1: 

629 form.roman = line 

630 case 2: 630 ↛ 624line 630 didn't jump to line 624 because the pattern on line 630 always matched

631 form.ipas.append(line) 

632 for header in col_headers: 

633 if ( 

634 col_index >= header.col_index 

635 and col_index < header.col_index + header.colspan 

636 ): 

637 form.raw_tags.append(header.text) 

638 for header in row_headers: 

639 if ( 

640 row_index < header.row_index + header.rowspan 

641 and row_index + rowspan > header.row_index 

642 ): 

643 form.raw_tags.append(header.text) 

644 if form.form not in ["", wxr.wtp.title]: 644 ↛ 647line 644 didn't jump to line 647 because the condition on line 644 was always true

645 translate_raw_tags(form) 

646 entry.forms.append(form) 

647 col_index += 1 

648 if row.contain_node(NodeKind.TABLE_CELL): 

649 row_index += 1 

650 

651 for link in expanded_node.find_child(NodeKind.LINK): 

652 clean_node(wxr, entry, link) 

653 wxr.wtp.title = word_page_title 

654 

655 

def get_cell_span(cell: WikiNode) -> tuple[int, int]:
    """
    Return the `(colspan, rowspan)` of a table cell.

    Each value is read from `cell.attrs`; 1 is used when the attribute is
    missing or is not made of digits only.
    """
    spans = []
    for attr_name in ("colspan", "rowspan"):
        attr_value = cell.attrs.get(attr_name, "1")
        is_number = re.fullmatch(r"\d+", attr_value) is not None
        spans.append(int(attr_value) if is_number else 1)
    return spans[0], spans[1]

666 

667 

def extract_de_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a "de-conj" template and extract the forms of its tables.

    In the first two tables each data cell is one form, optionally
    written as "tag: form". In the later tables data cells come in
    pairs that are joined with a space. `wxr.wtp.title` is set to
    `conj_page_title` only while the template is expanded and is
    restored even when the expansion raises.
    """
    word_page_title = wxr.wtp.title
    wxr.wtp.title = conj_page_title
    try:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
    finally:
        wxr.wtp.title = word_page_title

    for table_index, table in enumerate(
        expanded_node.find_child(NodeKind.TABLE)
    ):
        table_header = ""
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            word_part = ""
            # counts the non-empty cells of the row, header cells included
            col_index = 0
            if table_index >= 2 and row.contain_node(
                NodeKind.TABLE_HEADER_CELL
            ):
                col_headers.clear()
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    continue
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    # a header alone in its row is the title of the table
                    if len(row.children) == 1:
                        table_header = cell_text
                    else:
                        col_headers.append(cell_text)
                elif table_index < 2:
                    form = Form(form=cell_text, source=conj_page_title)
                    if ":" in cell_text:
                        raw_tag, _, form_text = cell_text.partition(":")
                        raw_tag = raw_tag.strip()
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        form.form = form_text.strip()
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if col_index < len(col_headers):
                        form.raw_tags.append(col_headers[col_index])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                elif col_index % 2 == 0:
                    # first cell of a pair, kept for the next cell
                    word_part = cell_text
                else:
                    form = Form(
                        form=f"{word_part} {cell_text}", source=conj_page_title
                    )
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    # two cells share one column header
                    if col_index // 2 < len(col_headers):
                        form.raw_tags.append(col_headers[col_index // 2])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += 1

    for cat_link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, cat_link)

735 

736 

def extract_declension_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
):
    """
    Load a page of the "Appendix" namespace and pass each of its
    top-level templates to `extract_declension_template` with an empty
    tab name. Nothing is done when the page body is not found.
    """
    appendix_ns_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
    page_text = wxr.wtp.get_page_body(page_title, appendix_ns_id)
    if page_text is None:
        return
    for template in wxr.wtp.parse(page_text).find_child(NodeKind.TEMPLATE):
        extract_declension_template(wxr, word_entry, page_title, template, "")

748 

749 

def extract_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
) -> None:
    """
    Extract forms from a template of a declension page.

    German adjective declension templates are extracted directly with
    `tab_name`. For "Onglets conjugaison", the templates found in each
    "contenuN" argument are processed recursively with the text of the
    matching "ongletN" argument as tab name. Other templates are ignored.
    """
    if t_node.template_name in [
        "de-adjectif-déclinaisons",
        "de-adj-déclinaisons",
    ]:
        extract_de_adj_declension_template(
            wxr, word_entry, page_title, t_node, tab_name
        )
    elif t_node.template_name == "Onglets conjugaison":
        params = t_node.template_parameters
        for index in range(1, 7):
            name_arg = f"onglet{index}"
            if name_arg not in params:
                break
            content_arg = f"contenu{index}"
            if content_arg not in params:
                # a tab declared without content has nothing to extract;
                # skip it instead of failing with KeyError
                continue
            current_tab_name = clean_node(wxr, None, params[name_arg])
            tab_content = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(params[content_arg])
            )
            for node in tab_content.find_child(NodeKind.TEMPLATE):
                extract_declension_template(
                    wxr, word_entry, page_title, node, current_tab_name
                )

781 

782 

def extract_de_adj_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
) -> None:
    """
    Expand a German adjective declension template and extract the forms
    of the tables found under each heading of the result.

    Cells under an "Article" column header are not forms: their text is
    stored in the `article` field of the following forms of the row.
    Other data cells become forms tagged with their column headers, then
    with the tab name, section title, table caption and row header when
    those are not empty.
    """
    # https://fr.wiktionary.org/wiki/Modèle:de-adjectif-déclinaisons
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for level_node in expanded_node.find_child(LEVEL_KIND_FLAGS):
        section_title = clean_node(wxr, None, level_node.largs)
        for table in level_node.find_child(NodeKind.TABLE):
            table_caption = ""
            for cap_node in table.find_child(NodeKind.TABLE_CAPTION):
                table_caption = clean_node(wxr, None, cap_node.children)
            col_headers = []
            for row in table.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                row_has_data = row.contain_node(NodeKind.TABLE_CELL)
                article = ""
                for cell in row.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    # `get_cell_span` falls back to 1 for a non-numeric
                    # attribute where a bare `int()` would raise ValueError
                    colspan, _ = get_cell_span(cell)
                    cell_text = clean_node(wxr, None, cell)
                    if cell.kind == NodeKind.TABLE_HEADER_CELL:
                        if row_has_data:
                            row_header = cell_text
                        elif cell_text != "Forme":
                            col_headers.append(
                                TableHeader(
                                    cell_text,
                                    col_index=col_index,
                                    colspan=colspan,
                                )
                            )
                    else:
                        use_col_headers = [
                            col_header.text
                            for col_header in col_headers
                            if col_header.col_index
                            <= col_index
                            < col_header.col_index + col_header.colspan
                        ]
                        if "Article" in use_col_headers:
                            if cell_text != "—":
                                article = cell_text
                        else:
                            form = Form(
                                form=cell_text,
                                article=article,
                                raw_tags=use_col_headers,
                                source=page_title,
                            )
                            for raw_tag in (
                                tab_name,
                                section_title,
                                table_caption,
                                row_header,
                            ):
                                if raw_tag != "":
                                    form.raw_tags.append(raw_tag)
                            if form.form not in ["", wxr.wtp.title]:
                                translate_raw_tags(form)
                                word_entry.forms.append(form)
                    # every cell advances the column, skipped ones included
                    col_index += colspan

        for link in level_node.find_child(NodeKind.LINK):
            clean_node(wxr, word_entry, link)

856 

857 

def extract_pt_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """
    Expand a "pt-conj/..." template and extract the forms of its tables.

    Header cells of rows without data cells are column headers, header
    cells of data rows are row headers. Each line of a data cell is one
    form, optionally written as "tag: form", tagged with the headers
    that overlap the cell. Lines equal to "", "-" or the page title
    are not added.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        row_headers = []
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # skip the columns taken by headers of earlier rows whose
            # rowspan reaches this row
            for header in chain(col_headers, row_headers):
                if (
                    header.row_index
                    < row_index
                    < header.row_index + header.rowspan
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell_node in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell_node)
                # `get_cell_span` falls back to 1 for a non-numeric
                # attribute where a bare `int()` would raise ValueError
                colspan, rowspan = get_cell_span(cell_node)
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    new_header = TableHeader(
                        cell_text, col_index, colspan, row_index, rowspan
                    )
                    if row_has_data:
                        row_headers.append(new_header)
                    else:
                        if (
                            cell_text
                            == "Formas pessoais\n(formes personnelles)"
                        ):
                            # this header starts a new part of the table
                            col_headers.clear()
                            row_headers.clear()
                        col_headers.append(new_header)
                elif cell_node.contain_node(NodeKind.LIST):
                    continue  # skip end notes
                else:
                    for line in cell_text.splitlines():
                        form_str = line.strip("/ \n")
                        raw_tag = ""
                        if ":" in form_str:
                            raw_tag, _, form_str = form_str.partition(":")
                            raw_tag = raw_tag.strip()
                            form_str = form_str.strip()
                        if form_str in ["", "-", wxr.wtp.title]:
                            continue
                        form = Form(form=form_str, source=page_title)
                        for col_header in col_headers:
                            overlaps_columns = (
                                col_header.col_index < col_index + colspan
                                and col_index
                                < col_header.col_index + col_header.colspan
                            )
                            # "Modo Subjuntivo" header
                            overlaps_rows_in_first_column = (
                                col_header.col_index == 0
                                and col_header.row_index < row_index + rowspan
                                and col_header.row_index + col_header.rowspan
                                > row_index
                            )
                            if (
                                (
                                    overlaps_columns
                                    or overlaps_rows_in_first_column
                                )
                                and col_header.text != ""
                                and col_header.text not in form.raw_tags
                            ):
                                form.raw_tags.append(col_header.text)
                        for row_header in row_headers:
                            if (
                                row_header.row_index < row_index + rowspan
                                and row_index
                                < row_header.row_index + row_header.rowspan
                                and row_header.text != ""
                                and row_header.text not in form.raw_tags
                            ):
                                form.raw_tags.append(row_header.text)
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan

963 

964 

def extract_cs_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """Collect forms from the tables rendered by a Czech conjugation template.

    The template is expanded and every table in the result is scanned.
    Header cells with `scope="col"` supply column tags and header cells with
    `scope="row"` supply the tag of the current row. Inside a data cell,
    `<br>` separates alternative forms, and `<span>` elements whose class
    contains "ligne-de-forme" or "registre" are read as extra tags for the
    form on the same line. Each form is appended to `word_entry.forms` with
    `page_title` as its `source`.
    """

    def add_form(form_nodes, col_headers, col_index, row_header, raw_tags):
        """Flush the buffered nodes as one form and reset both buffers."""
        form_str = clean_node(wxr, None, form_nodes)
        if form_str == "":
            # Nothing usable has been buffered for this line; leave the
            # buffers alone so tags gathered so far stay pending.
            return
        if form_str not in ["—", wxr.wtp.title]:
            form = Form(form=form_str, source=page_title)
            if col_index < len(col_headers):
                form.raw_tags.append(col_headers[col_index])
            if row_header != "":
                form.raw_tags.append(row_header)
            form.raw_tags.extend(raw_tags)
            translate_raw_tags(form)
            word_entry.forms.append(form)
        # Reset even when the line was skipped (placeholder dash or the page
        # title itself). Otherwise its nodes and tags would be merged into
        # the next form of the same cell.
        form_nodes.clear()
        raw_tags.clear()

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            for col_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    cell_scope = cell.attrs.get("scope", "")
                    if cell_scope == "col":
                        col_headers.append(clean_node(wxr, None, cell))
                    elif cell_scope == "row":
                        row_header = clean_node(wxr, None, cell)
                else:
                    # Buffers for the form currently being read in this cell.
                    raw_tags = []
                    form_nodes = []
                    for node in cell.children:
                        if isinstance(node, HTMLNode) and node.tag == "span":
                            span_class = node.attrs.get("class", "").split()
                            if (
                                "ligne-de-forme" in span_class
                                or "registre" in span_class
                            ):
                                # Tag spans are kept out of the form text.
                                raw_tag = clean_node(wxr, None, node).strip(
                                    "() "
                                )
                                if raw_tag != "":
                                    raw_tags.append(raw_tag)
                            else:
                                form_nodes.append(node)
                        elif isinstance(node, HTMLNode) and node.tag == "br":
                            # A line break ends the current form.
                            add_form(
                                form_nodes,
                                col_headers,
                                col_index,
                                row_header,
                                raw_tags,
                            )
                        else:
                            form_nodes.append(node)
                    # Flush whatever follows the last line break.
                    add_form(
                        form_nodes, col_headers, col_index, row_header, raw_tags
                    )