Coverage for src / wiktextract / extractor / fr / conjugation.py: 91%

556 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from dataclasses import dataclass 

3from itertools import chain 

4 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 HTMLNode, 

8 LevelNode, 

9 NodeKind, 

10 TemplateNode, 

11 WikiNode, 

12) 

13 

14from ...page import clean_node 

15from ...wxr_context import WiktextractContext 

16from .models import Form, WordEntry 

17from .tags import translate_raw_tags 

18 

19 

def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """
    Load a page of the "Conjugaison" namespace and extract its forms.

    Templates that are direct children of the page root, or direct children
    of a section at the page root, are handed to `extract_conj_templates()`
    together with `select_tab`. Pages whose title starts with
    "Conjugaison:kurde/" additionally get their top-level wikitext tables
    parsed. Returns without doing anything when the page body is not found.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    page_body = wxr.wtp.get_page_body(conj_page_title, namespace_id)
    if page_body is None:
        return

    root = wxr.wtp.parse(page_body)
    for child in root.children:
        if isinstance(child, TemplateNode):
            templates = [child]
        elif isinstance(child, LevelNode):
            templates = child.find_child(NodeKind.TEMPLATE)
        else:
            continue
        for template in templates:
            extract_conj_templates(
                wxr, entry, conj_page_title, template, select_tab
            )

    if conj_page_title.startswith("Conjugaison:kurde/"):
        for table in root.find_child(NodeKind.TABLE):
            extract_ku_conj_trans_table_node(
                wxr, entry, table, conj_page_title
            )

    # NOTE(review): the listing this was taken from had lost its indentation;
    # this loop is assumed to run for every page, not only Kurdish ones.
    # No entry is passed to clean_node() and its result is discarded.
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, None, link)

56 

57 

def extract_conj_templates(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    conj_template: TemplateNode,
    select_tab: str = "1",
) -> None:
    """
    Route one template of a conjugation page to the matching extractor.

    The extractor is chosen from the template name only. Templates whose
    name ends with "-intro", and templates matching none of the rules
    below, are ignored. A template named "Conjugaison:…" (optionally
    prefixed with ":") is a transclusion of another conjugation page: that
    page is extracted recursively, using the cleaned "sél" argument
    (default "2") as the selected tab.
    """
    name = conj_template.template_name
    if name.endswith("-intro"):
        return

    # Extractors selected by an exact template name; they all share the
    # same call signature.
    exact_name_handlers = {
        "ku-conj-trans": extract_ku_conj_trans_template,
        "ku-conj": extract_ku_conj_trans_template,
        "ko-conj": extract_ko_conj_template,
        "de-conj": extract_de_conj_template,
    }
    handler = exact_name_handlers.get(name)
    if handler is not None:
        handler(wxr, entry, conj_template, conj_page_title)
        return

    page_name = name.removeprefix(":")
    if name.startswith("pt-conj/"):
        extract_pt_conj_template(wxr, entry, conj_template, conj_page_title)
    elif name.startswith("cs-conj-"):
        extract_cs_conj_template(wxr, entry, conj_template, conj_page_title)
    elif name.startswith(("ro-verb-", "se-conj-")):
        from .inflection import extract_inf_table_template

        extract_inf_table_template(wxr, entry, conj_template, conj_page_title)
    elif "-conj" in name or name.startswith("it-"):
        # "it-" covers the Italian table templates:
        # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_italien
        process_conj_template(wxr, entry, conj_template, conj_page_title)
    elif name == "Onglets conjugaison":
        process_onglets_conjugaison_template(
            wxr, entry, conj_template, conj_page_title, select_tab
        )
    elif page_name.startswith("Conjugaison:"):
        selected = clean_node(
            wxr, None, conj_template.template_parameters.get("sél", "2")
        )
        extract_conjugation(wxr, entry, page_name, selected)
    elif name.startswith("ja-flx-adj"):
        process_ja_flx_adj_template(wxr, entry, conj_template, conj_page_title)
    elif name.startswith("ja-"):
        process_ja_conj_template(wxr, entry, conj_template, conj_page_title)

109 

110 

def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """
    Extract the conjugation templates held in an "Onglets conjugaison" tab
    container.

    https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison

    Tab N's content is the "contenuN" argument. Only tab `select_tab` is
    read when `select_tab` is not "1", or when the first tab's title
    ("onglet1") is "Conjugaison active"; otherwise tabs 1 to 6 are read in
    order, stopping at the first missing "contenuN" argument.

    Within a tab, every template whose name contains "-conj" or starts with
    "it-" is passed to `process_conj_template()`. This applies equally
    whether the argument value is a single template node or a list of
    nodes.
    """
    params = node.template_parameters
    first_tab_title_is_active = select_tab == "1" and (
        clean_node(wxr, None, params.get("onglet1", ""))
        == "Conjugaison active"
    )
    if select_tab != "1" or first_tab_title_is_active:
        selected_tabs = [select_tab]
    else:
        selected_tabs = [str(i) for i in range(1, 7)]

    for tab_index in selected_tabs:
        arg_name = f"contenu{tab_index}"
        if arg_name not in params:
            break
        arg_value = params[arg_name]
        # An argument holding exactly one node is not wrapped in a list;
        # normalise so both shapes go through the same name check.
        arg_nodes = arg_value if isinstance(arg_value, list) else [arg_value]
        for arg_node in arg_nodes:
            if isinstance(arg_node, TemplateNode) and (
                "-conj" in arg_node.template_name
                or arg_node.template_name.startswith("it-")
            ):
                process_conj_template(wxr, entry, arg_node, conj_page_title)

148 

149 

def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a conjugation table template and extract forms from the result.

    The template is converted back to wikitext, re-parsed with every
    template expanded, and the resulting tree is handed to
    `process_expanded_conj_template()`.

    https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)

164 

165 

def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Walk an expanded conjugation template and process each table wrapper.

    The current heading is the title of `node` when it is a level-3
    section, otherwise empty, and is replaced by each HTML <h3> child
    encountered. Each <div> child is processed under the heading in effect
    when it is reached: "Modes impersonnels" selects
    `process_fr_conj_modes_table()`, any other heading selects
    `process_fr_conj_table()`. Child sections are processed recursively.
    """
    heading = ""
    if node.kind == NodeKind.LEVEL3:
        heading = clean_node(wxr, None, node.largs)

    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue
        if child.tag == "h3":
            heading = clean_node(wxr, None, child)
        elif child.tag == "div":
            if heading == "Modes impersonnels":
                process_fr_conj_modes_table(wxr, entry, child, conj_page_title)
            else:
                process_fr_conj_table(
                    wxr, entry, child, heading, conj_page_title
                )

192 

193 

@dataclass
class TableHeader:
    """Text of a table header cell plus the grid area it was recorded at."""

    # cleaned text of the header cell
    text: str
    # column at which the header starts
    col_index: int = 0
    # number of columns covered; note that the default is 0, not 1
    colspan: int = 0
    # row at which the header starts
    row_index: int = 0
    # number of rows covered; note that the default is 0, not 1
    rowspan: int = 0

201 

202 

def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the first table of a page, "Modes impersonnels".

    A row without any data cell is a header row: each of its header cells
    except "Mode" becomes a column header covering `colspan` columns. In
    other rows a header cell (or a bold first cell) is the row header, and
    the remaining cells are accumulated into a form text until a cell
    ending with "]", a backslash or "Prononciation ?" closes the form. Only
    the first two endings are stored as IPA. Each form is tagged with the
    row header and the column headers covering the closing cell's column.
    """
    ipa_endings = ("]", "\\")
    closing_endings = ("]", "\\", "Prononciation ?")

    for table_node in div_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            is_header_row = not row.contain_node(NodeKind.TABLE_CELL)
            row_header = ""
            pending_form = ""
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                is_header_cell = cell.kind == NodeKind.TABLE_HEADER_CELL or (
                    cell.contain_node(NodeKind.BOLD) and col_index == 0
                )
                if is_header_cell:
                    if not is_header_row:
                        # header cells do not advance col_index in data rows
                        row_header = clean_node(wxr, None, cell)
                        continue
                    header_text = clean_node(wxr, None, cell)
                    if header_text != "Mode":
                        colspan, _ = get_cell_span(cell)
                        col_headers.append(
                            TableHeader(header_text, col_index, colspan)
                        )
                        col_index += colspan
                    continue

                cell_text = clean_node(wxr, None, cell)
                if pending_form != "" and cell_text.endswith(closing_endings):
                    form = Form(
                        form=pending_form,
                        ipas=[cell_text]
                        if cell_text.endswith(ipa_endings)
                        else [],
                        source=conj_page_title,
                    )
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    form.raw_tags.extend(
                        header.text
                        for header in col_headers
                        if header.col_index
                        <= col_index
                        < header.col_index + header.colspan
                    )
                    translate_raw_tags(form)
                    entry.forms.append(form)
                    pending_form = ""
                elif cell_text != "":
                    # no space is inserted after an elision apostrophe
                    if pending_form != "" and not pending_form.endswith("’"):
                        pending_form += " "
                    pending_form += cell_text
                col_index += 1

269 

270 

def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Process the tense tables nested in the cells of a layout table.

    Every data cell of every wikitext table directly under `div_node` is
    scanned for direct children that are either an HTML <table> element or
    a wikitext table; each one is forwarded, with `h3_text`, to the
    matching table extractor.
    """
    for layout_table in div_node.find_child(NodeKind.TABLE):
        for row in layout_table.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(NodeKind.TABLE_CELL):
                for inner in cell.children:
                    if not isinstance(inner, WikiNode):
                        continue
                    if inner.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, inner, h3_text, conj_page_title
                        )
                    elif inner.kind == NodeKind.HTML and inner.tag == "table":
                        process_fr_conj_html_table(
                            wxr, entry, inner, h3_text, conj_page_title
                        )

298 

299 

def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Extract one form per body row of an HTML <table> tense table.

    The cleaned text of the first <tr> is used as a tag next to `h3_text`
    (when not empty). In every later row the first two <td> texts are
    concatenated into the form and the remaining ones into a single IPA
    string. Every later row yields a form, even when its text is empty.
    """
    raw_tags = [] if h3_text == "" else [h3_text]
    for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
        if tr_index == 0:
            raw_tags.append(clean_node(wxr, None, tr_node.children))
            continue

        form = Form(raw_tags=raw_tags, source=conj_page_title)
        for td_index, td_node in enumerate(
            tr_node.find_html_recursively("td")
        ):
            text = clean_node(wxr, None, td_node)
            if td_index >= 2:
                if form.ipas:
                    form.ipas[0] += text
                else:
                    # the first IPA part gets a trailing space unless it
                    # ends with the liaison mark
                    form.ipas.append(
                        text if text.endswith("‿") else text + " "
                    )
                continue
            form.form += text
            # no space after a pronoun ending with an elision apostrophe
            if td_index == 0 and not text.endswith("’"):
                form.form += " "

        translate_raw_tags(form)
        entry.forms.append(form)

331 

332 

def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Extract one form per body row of a wikitext tense table.

    The cleaned text of the first row is used as a tag next to `h3_text`
    (when not empty). In every later row the first two data cells build the
    form text and the remaining cells are IPA strings. Cells equal to "—"
    or ending with "Prononciation ?" contribute nothing. Rows that end up
    with an empty form text are dropped.
    """
    raw_tags = [] if h3_text == "" else [h3_text]
    for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
        if row_index == 0:
            raw_tags.append(clean_node(wxr, None, row.children))
            continue

        form = Form(raw_tags=raw_tags, source=conj_page_title)
        for cell_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            text = clean_node(wxr, None, cell)
            lacks_pronunciation = text.endswith("Prononciation ?")
            if cell_index >= 2:
                if not lacks_pronunciation:
                    form.ipas.append(text)
                continue
            if text == "—" or lacks_pronunciation:
                continue
            # a part starting with "-" is glued to the previous part unless
            # that part ends with a closing parenthesis
            if text.startswith("-") and not form.form.strip().endswith(")"):
                form.form = form.form.strip()
            form.form += text
            if cell_index == 0 and text != "":
                form.form += " "

        if form.form != "":
            translate_raw_tags(form)
            entry.forms.append(form)

368 

369 

370def process_ja_flx_adj_template( 

371 wxr: WiktextractContext, 

372 entry: WordEntry, 

373 template_node: TemplateNode, 

374 conj_page_title: str, 

375) -> None: 

376 # https://fr.wiktionary.org/wiki/Modèle:ja-adj 

377 # https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な 

378 expanded_template = wxr.wtp.parse( 

379 wxr.wtp.node_to_wikitext(template_node), expand_all=True 

380 ) 

381 for table_node in expanded_template.find_child(NodeKind.TABLE): 

382 first_tag = "" 

383 for row in table_node.find_child(NodeKind.TABLE_ROW): 

384 forms = [] 

385 tags = [first_tag] 

386 for cell_index, row_child in enumerate( 

387 row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL) 

388 ): 

389 row_child_text = clean_node(wxr, None, row_child) 

390 if row_child.kind == NodeKind.TABLE_HEADER_CELL: 

391 first_tag = row_child_text 

392 else: 

393 for line_index, line in enumerate( 

394 row_child_text.splitlines() 

395 ): 

396 if cell_index == 0: 

397 tags.append(line) 

398 continue 

399 if line_index + 1 > len(forms): 

400 forms.append( 

401 translate_raw_tags( 

402 Form(raw_tags=tags, source=conj_page_title) 

403 ) 

404 ) 

405 if cell_index == 1: 

406 forms[line_index].form = line 

407 elif cell_index == 2: 

408 forms[line_index].hiragana = line 

409 elif cell_index == 3: 409 ↛ 393line 409 didn't jump to line 393 because the condition on line 409 was always true

410 forms[line_index].roman = line 

411 

412 entry.forms.extend(forms) 

413 

414 

def process_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from an expanded Japanese verb conjugation template.

    https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    Modèle:ja-在る

    A row holding a single header cell sets the first tag of the following
    rows. Other header cells are row headers that stay active for
    `rowspan` rows. In a data row, each line of cell 0 starts a form, and
    the matching lines of cells 1 and 2 are its hiragana and romanization.
    Forms with an empty text are dropped.

    The rowspan is read with `get_cell_span()`, so a malformed attribute
    counts as 1 instead of raising `ValueError`.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows (current one included) it still spans
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            only_header_cells = all(
                isinstance(child, WikiNode)
                and child.kind == NodeKind.TABLE_HEADER_CELL
                for child in row.children
            )
            if only_header_cells and len(row.children) > 1:
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    _, rowspan = get_cell_span(header)
                    row_headers[header_text] = rowspan

            tags = [first_tag]
            # iterate over a snapshot: entries are deleted while looping
            for tag, remaining_rows in list(row_headers.items()):
                tags.append(tag)
                if remaining_rows == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = remaining_rows - 1

            forms = []
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                for line_index, line in enumerate(cell_text.splitlines()):
                    if cell_index == 0:
                        forms.append(
                            Form(
                                form=line.strip(),
                                raw_tags=tags,
                                source=conj_page_title,
                            )
                        )
                    elif cell_index == 1 and line_index < len(forms):
                        forms[line_index].hiragana = line.strip()
                    elif cell_index == 2 and line_index < len(forms):
                        forms[line_index].roman = line.strip()

            for form in forms:
                if form.form != "":
                    translate_raw_tags(form)
                    entry.forms.append(form)

479 

480 

def extract_ku_conj_trans_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a Kurdish conjugation template and extract its tables.

    Every top-level table of the expansion is parsed with
    `extract_ku_conj_trans_table_node()`. Top-level links are then cleaned
    with `entry` passed as the data target.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for table_node in expanded.find_child(NodeKind.TABLE):
        extract_ku_conj_trans_table_node(
            wxr, entry, table_node, conj_page_title
        )
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link)

494 

495 

def extract_ku_conj_trans_table_node(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from one Kurdish conjugation table.

    Header cells (except the section titles in `ignore_headers`) become
    column headers. The header list is reset when a row containing header
    cells follows a row without any. Every non-empty data cell that is not
    "(inusité)" and differs from the current page title becomes a form
    tagged with the headers covering its column. Data cells are skipped
    while the most recent header cell is "TEMPS DU PASSÉ".
    """

    @dataclass
    class TableHeader:
        text: str
        index: int
        span: int

    ignore_headers = (
        "Conjugaison du verbe",
        "TEMPS DU PRÉSENT ET DU FUTUR",
        "TEMPS DU PRESENT ET DU FUTUR",
        "TEMPS DU PASSÉ",
        "TEMPS DU PASSE",
    )
    col_headers = []
    previous_row_has_header = False
    last_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_has_header = row.contain_node(NodeKind.TABLE_HEADER_CELL)
        if row_has_header and not previous_row_has_header:
            col_headers.clear()

        col_index = 0
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_str = clean_node(wxr, None, cell)
            if cell_str == "":
                col_index += 1
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                last_header = cell_str
                if cell_str.startswith(ignore_headers):
                    continue
                colspan, _ = get_cell_span(cell)
                col_headers.append(
                    TableHeader(text=cell_str, index=col_index, span=colspan)
                )
                col_index += colspan
                continue

            # NOTE(review): only the accented spelling is tested here,
            # although `ignore_headers` also lists "TEMPS DU PASSE" —
            # confirm whether that variant should skip cells too.
            if last_header == "TEMPS DU PASSÉ":
                continue
            if cell_str == "(inusité)":
                col_index += 1
                continue
            if cell_str == wxr.wtp.title:
                # a cell repeating the page title is not a form and does
                # not advance the column index
                continue

            form = Form(form=cell_str, source=conj_page_title)
            form.raw_tags.extend(
                header.text
                for header in col_headers
                if header.index <= col_index < header.index + header.span
            )
            translate_raw_tags(form)
            entry.forms.append(form)
            col_index += 1
        previous_row_has_header = row_has_header

559 

560 

561def extract_ko_conj_template( 

562 wxr: WiktextractContext, 

563 entry: WordEntry, 

564 t_node: TemplateNode, 

565 conj_page_title: str, 

566) -> None: 

567 word_page_title = wxr.wtp.title 

568 wxr.wtp.title = conj_page_title 

569 expanded_node = wxr.wtp.parse( 

570 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

571 ) 

572 for h3 in expanded_node.find_html("h3"): 

573 clean_node(wxr, entry, h3) 

574 for table_index, table in enumerate( 

575 expanded_node.find_child(NodeKind.TABLE) 

576 ): 

577 if table_index == 0: 

578 continue 

579 shared_raw_tags = [] 

580 for caption_node in table.find_child(NodeKind.TABLE_CAPTION): 

581 caption = clean_node(wxr, None, caption_node.children) 

582 if caption != "": 582 ↛ 580line 582 didn't jump to line 580 because the condition on line 582 was always true

583 shared_raw_tags.append(caption) 

584 col_headers = [] 

585 row_headers = [] 

586 row_index = 0 

587 row_header_indexes = [0] 

588 for row in table.find_child(NodeKind.TABLE_ROW): 

589 col_index = 0 

590 for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL): 

591 cell_str = clean_node(wxr, None, header_cell) 

592 if cell_str == "": 

593 continue 

594 colspan, rowspan = get_cell_span(header_cell) 

595 if row.contain_node(NodeKind.TABLE_CELL): 

596 header_added = False 

597 current_row_index = row_index 

598 for index, row_header_index in enumerate( 598 ↛ 606line 598 didn't jump to line 606 because the loop on line 598 didn't complete

599 row_header_indexes 

600 ): 

601 if row_index >= row_header_index: 

602 current_row_index = row_header_indexes[index] 

603 row_header_indexes[index] += rowspan 

604 header_added = True 

605 break 

606 if not header_added: 606 ↛ 607line 606 didn't jump to line 607 because the condition on line 606 was never true

607 row_header_indexes.append(rowspan) 

608 row_headers.append( 

609 TableHeader( 

610 text=cell_str, 

611 row_index=current_row_index, 

612 rowspan=rowspan, 

613 ) 

614 ) 

615 else: 

616 col_headers.append( 

617 TableHeader( 

618 text=cell_str, 

619 col_index=col_index, 

620 colspan=colspan, 

621 ) 

622 ) 

623 col_index += colspan 

624 if row.contain_node(NodeKind.TABLE_CELL): 

625 row_index += 1 

626 

627 row_index = 0 

628 for row in table.find_child(NodeKind.TABLE_ROW): 

629 col_index = 0 

630 for cell in row.find_child(NodeKind.TABLE_CELL): 

631 cell_str = clean_node(wxr, None, cell) 

632 colspan, rowspan = get_cell_span(cell) 

633 if cell_str == "—": 633 ↛ 634line 633 didn't jump to line 634 because the condition on line 633 was never true

634 col_index += 1 

635 else: 

636 form = Form( 

637 source=conj_page_title, raw_tags=shared_raw_tags 

638 ) 

639 for line_index, line in enumerate(cell_str.splitlines()): 

640 match line_index: 

641 case 0: 

642 form.form = line 

643 case 1: 

644 form.roman = line 

645 case 2: 645 ↛ 639line 645 didn't jump to line 639 because the pattern on line 645 always matched

646 form.ipas.append(line) 

647 for header in col_headers: 

648 if ( 

649 col_index >= header.col_index 

650 and col_index < header.col_index + header.colspan 

651 ): 

652 form.raw_tags.append(header.text) 

653 for header in row_headers: 

654 if ( 

655 row_index < header.row_index + header.rowspan 

656 and row_index + rowspan > header.row_index 

657 ): 

658 form.raw_tags.append(header.text) 

659 if form.form not in ["", wxr.wtp.title]: 659 ↛ 662line 659 didn't jump to line 662 because the condition on line 659 was always true

660 translate_raw_tags(form) 

661 entry.forms.append(form) 

662 col_index += 1 

663 if row.contain_node(NodeKind.TABLE_CELL): 

664 row_index += 1 

665 

666 for link in expanded_node.find_child(NodeKind.LINK): 

667 clean_node(wxr, entry, link) 

668 wxr.wtp.title = word_page_title 

669 

670 

def get_cell_span(cell: WikiNode) -> tuple[int, int]:
    """
    Return the `(colspan, rowspan)` of a table cell.

    Each value is read from the attribute of the same name in `cell.attrs`.
    It is 1 when the attribute is missing or does not consist solely of
    digits.
    """
    spans = []
    for attr_name in ("colspan", "rowspan"):
        attr_value = cell.attrs.get(attr_name, "1")
        is_number = re.fullmatch(r"\d+", attr_value) is not None
        spans.append(int(attr_value) if is_number else 1)
    return spans[0], spans[1]

681 

682 

def extract_de_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a "de-conj" template and extract the forms of its tables.

    `wxr.wtp.title` is set to `conj_page_title` only while the template is
    expanded, and is restored even if the expansion raises.

    In the first two tables each non-empty data cell is one form; text
    before a ":" in the cell is a tag and text after it is the form. In
    later tables non-empty data cells come in pairs that are joined with a
    space, and the column headers are reset by every row containing header
    cells. A header cell that is the only child of its row is the table
    header. Forms that are empty or equal to the page title are not added.
    Top-level links are cleaned with `word_entry` as the data target.
    """
    word_page_title = wxr.wtp.title
    wxr.wtp.title = conj_page_title
    try:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
    finally:
        wxr.wtp.title = word_page_title

    for table_index, table in enumerate(
        expanded_node.find_child(NodeKind.TABLE)
    ):
        paired_cells = table_index >= 2
        table_header = ""
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            word_part = ""
            col_index = 0
            if paired_cells and row.contain_node(NodeKind.TABLE_HEADER_CELL):
                col_headers.clear()
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    # empty cells do not advance the column index
                    continue

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if len(row.children) == 1:
                        table_header = cell_text
                    else:
                        col_headers.append(cell_text)
                elif paired_cells and col_index % 2 == 0:
                    # first half of a pair, completed by the next cell
                    word_part = cell_text
                else:
                    if paired_cells:
                        form = Form(
                            form=f"{word_part} {cell_text}",
                            source=conj_page_title,
                        )
                        header_index = col_index // 2
                    else:
                        form = Form(form=cell_text, source=conj_page_title)
                        raw_tag, colon, rest = cell_text.partition(":")
                        if colon != "":
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                form.raw_tags.append(raw_tag)
                            form.form = rest.strip()
                        header_index = col_index
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if header_index < len(col_headers):
                        form.raw_tags.append(col_headers[header_index])
                    if form.form not in ("", wxr.wtp.title):
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += 1

    for cat_link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, cat_link)

750 

751 

def extract_declension_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """
    Load a declension page and extract forms from its top-level templates.

    The page is looked up in the namespace registered as "Appendix" in
    `wxr.wtp.NAMESPACE_DATA`. Nothing happens when the page is not found.
    Each top-level template is passed to `extract_declension_template()`
    with an empty tab name.
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
    body = wxr.wtp.get_page_body(page_title, namespace_id)
    if body is None:
        return
    for template in wxr.wtp.parse(body).find_child(NodeKind.TEMPLATE):
        extract_declension_template(wxr, word_entry, page_title, template, "")

763 

764 

def extract_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
) -> None:
    """
    Extract forms from one template of a declension page.

    German adjective declension templates are extracted directly, with
    `tab_name` passed on as a tag. An "Onglets conjugaison" container is
    unpacked: for each tab 1 to 6, up to the first missing "ongletN"
    argument, the templates inside "contenuN" are processed recursively
    with the cleaned tab title as `tab_name`. A tab that has a title but no
    "contenuN" argument is skipped. Other templates are ignored.
    """
    if t_node.template_name in (
        "de-adjectif-déclinaisons",
        "de-adj-déclinaisons",
    ):
        extract_de_adj_declension_template(
            wxr, word_entry, page_title, t_node, tab_name
        )
    elif t_node.template_name == "Onglets conjugaison":
        params = t_node.template_parameters
        for index in range(1, 7):
            title_arg = f"onglet{index}"
            if title_arg not in params:
                break
            content_arg = f"contenu{index}"
            if content_arg not in params:
                # titled tab without content: nothing to extract
                continue
            tab_title = clean_node(wxr, None, params[title_arg])
            tab_content = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(params[content_arg])
            )
            for node in tab_content.find_child(NodeKind.TEMPLATE):
                extract_declension_template(
                    wxr, word_entry, page_title, node, tab_title
                )

796 

797 

def extract_de_adj_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
) -> None:
    """
    Extract forms from an expanded German adjective declension template.

    https://fr.wiktionary.org/wiki/Modèle:de-adjectif-déclinaisons

    Each top-level section of the expansion is scanned for tables. Header
    cells of rows without data cells are column headers ("Forme" is
    ignored); a header cell in a data row is the row header. A data cell
    under the "Article" column sets the article of the forms that follow in
    the same row, unless it is "—". Every other data cell is a form tagged
    with its column headers, then `tab_name`, the section title, the table
    caption and the row header (empty values are left out). Forms that are
    empty or equal to the page title are not added.

    Column spans are read with `get_cell_span()`, so a malformed "colspan"
    counts as 1 instead of raising `ValueError`.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for level_node in expanded_node.find_child(LEVEL_KIND_FLAGS):
        section_title = clean_node(wxr, None, level_node.largs)
        for table in level_node.find_child(NodeKind.TABLE):
            table_caption = ""
            for cap_node in table.find_child(NodeKind.TABLE_CAPTION):
                table_caption = clean_node(wxr, None, cap_node.children)

            col_headers = []
            for row in table.find_child(NodeKind.TABLE_ROW):
                row_has_data = row.contain_node(NodeKind.TABLE_CELL)
                row_header = ""
                article = ""
                col_index = 0
                for cell in row.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    colspan, _ = get_cell_span(cell)
                    cell_text = clean_node(wxr, None, cell)
                    if cell.kind == NodeKind.TABLE_HEADER_CELL:
                        if row_has_data:
                            row_header = cell_text
                        elif cell_text != "Forme":
                            col_headers.append(
                                TableHeader(
                                    cell_text,
                                    col_index=col_index,
                                    colspan=colspan,
                                )
                            )
                    else:
                        use_col_headers = [
                            header.text
                            for header in col_headers
                            if header.col_index
                            <= col_index
                            < header.col_index + header.colspan
                        ]
                        if "Article" in use_col_headers:
                            if cell_text != "—":
                                article = cell_text
                        else:
                            form = Form(
                                form=cell_text,
                                article=article,
                                raw_tags=use_col_headers,
                                source=page_title,
                            )
                            form.raw_tags.extend(
                                raw_tag
                                for raw_tag in (
                                    tab_name,
                                    section_title,
                                    table_caption,
                                    row_header,
                                )
                                if raw_tag != ""
                            )
                            if form.form not in ("", wxr.wtp.title):
                                translate_raw_tags(form)
                                word_entry.forms.append(form)
                    # header and data cells both advance the column index
                    col_index += colspan

        for link in level_node.find_child(NodeKind.LINK):
            clean_node(wxr, word_entry, link)

871 

872 

def extract_pt_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """Extract forms from the tables produced by a ``pt`` conjugation template.

    The template is expanded with ``expand_all=True`` and every table in the
    result is walked row by row while tracking the column position of each
    cell from its ``colspan``/``rowspan`` attributes.

    * Header cells in a row that also contains data cells are recorded as
      row headers; header cells in other rows are recorded as column headers.
    * The header ``"Formas pessoais\\n(formes personnelles)"`` discards all
      headers collected so far before being recorded.
    * Each non-empty line of a data cell becomes a ``Form``. Its raw tags are
      the texts of the headers overlapping the cell, followed by the text
      before the first ``:`` on the line, if any.

    Forms equal to ``""``, ``"-"`` or ``wxr.wtp.title`` are not added.

    :param wxr: context used to expand the template and clean nodes
    :param word_entry: entry whose ``forms`` list receives the new forms
    :param t_node: conjugation template node to expand
    :param page_title: stored as ``source`` on every created form
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        row_headers = []
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            # Start after the columns still occupied by headers that span
            # down into this row from an earlier row.
            col_index = 0
            for header in chain(col_headers, row_headers):
                if (
                    row_index > header.row_index
                    and row_index < header.row_index + header.rowspan
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell_node in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell_node)
                colspan = int(cell_node.attrs.get("colspan", "1"))
                rowspan = int(cell_node.attrs.get("rowspan", "1"))
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    new_header = TableHeader(
                        cell_text,
                        col_index,
                        colspan,
                        row_index,
                        rowspan,
                    )
                    if row_has_data:
                        row_headers.append(new_header)
                    else:
                        if (
                            cell_text
                            == "Formas pessoais\n(formes personnelles)"
                        ):
                            # A new section starts: drop the headers of the
                            # previous one.
                            col_headers.clear()
                            row_headers.clear()
                        col_headers.append(new_header)
                elif not cell_node.contain_node(NodeKind.LIST):
                    # Data cells holding a list are end notes and yield no
                    # forms; every other data cell is split into lines.
                    cell_end_col = col_index + colspan
                    cell_end_row = row_index + rowspan
                    for line in cell_text.splitlines():
                        form_str = line.strip("/ \n")
                        raw_tag = ""
                        if ":" in form_str:
                            # "label: form" -> the label becomes a raw tag
                            raw_tag, _, form_str = form_str.partition(":")
                            raw_tag = raw_tag.strip()
                            form_str = form_str.strip()
                        if form_str in ["", "-", wxr.wtp.title]:
                            continue
                        form = Form(form=form_str, source=page_title)
                        for col_header in col_headers:
                            header_end_col = (
                                col_header.col_index + col_header.colspan
                            )
                            header_end_row = (
                                col_header.row_index + col_header.rowspan
                            )
                            overlaps_cols = (
                                col_header.col_index < cell_end_col
                                and col_index < header_end_col
                            )
                            # "Modo Subjuntivo" header
                            first_col_same_rows = (
                                col_header.col_index == 0
                                and col_header.row_index < cell_end_row
                                and header_end_row > row_index
                            )
                            if (
                                (overlaps_cols or first_col_same_rows)
                                and col_header.text != ""
                                and col_header.text not in form.raw_tags
                            ):
                                form.raw_tags.append(col_header.text)
                        for row_header in row_headers:
                            header_end_row = (
                                row_header.row_index + row_header.rowspan
                            )
                            if (
                                row_header.row_index < cell_end_row
                                and row_index < header_end_row
                                and row_header.text != ""
                                and row_header.text not in form.raw_tags
                            ):
                                form.raw_tags.append(row_header.text)
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                # Every cell, including a skipped end-note cell, occupies
                # its columns, so the position must always advance.
                col_index += colspan

978 

979 

def extract_cs_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """Extract forms from the tables produced by a ``cs`` conjugation template.

    The template is expanded with ``expand_all=True`` and every table in the
    result is walked row by row.

    * Header cells with ``scope="col"`` are collected as column headers.
    * A header cell with ``scope="row"`` sets the header of the current row.
    * The children of each data cell are split on ``<br>`` tags; every
      segment becomes one ``Form``.
    * ``<span>`` elements whose ``class`` contains ``ligne-de-forme`` or
      ``registre`` are not part of the form text. Their cleaned text, with
      surrounding parentheses and spaces stripped, is added as a raw tag of
      the segment they appear in.

    Forms equal to ``""``, ``"—"`` or ``wxr.wtp.title`` are not added.

    :param wxr: context used to expand the template and clean nodes
    :param word_entry: entry whose ``forms`` list receives the new forms
    :param t_node: conjugation template node to expand
    :param page_title: stored as ``source`` on every created form
    """

    def add_form(form_nodes, col_headers, col_index, row_header, raw_tags):
        """Flush the buffered segment as a form, then reset both buffers."""
        form_str = clean_node(wxr, None, form_nodes)
        if form_str not in ["", "—", wxr.wtp.title]:
            form = Form(form=form_str, source=page_title)
            if col_index < len(col_headers):
                form.raw_tags.append(col_headers[col_index])
            if row_header != "":
                form.raw_tags.append(row_header)
            form.raw_tags.extend(raw_tags)
            translate_raw_tags(form)
            word_entry.forms.append(form)
        # Reset the buffers even when the segment was skipped; otherwise the
        # nodes and tags of a skipped segment would be merged into the form
        # that follows the next <br>.
        form_nodes.clear()
        raw_tags.clear()

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            for col_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    cell_scope = cell.attrs.get("scope", "")
                    if cell_scope == "col":
                        col_headers.append(clean_node(wxr, None, cell))
                    elif cell_scope == "row":
                        row_header = clean_node(wxr, None, cell)
                else:
                    raw_tags = []
                    form_nodes = []
                    for node in cell.children:
                        if isinstance(node, HTMLNode) and node.tag == "span":
                            span_class = node.attrs.get("class", "").split()
                            if (
                                "ligne-de-forme" in span_class
                                or "registre" in span_class
                            ):
                                raw_tag = clean_node(wxr, None, node).strip(
                                    "() "
                                )
                                if raw_tag != "":
                                    raw_tags.append(raw_tag)
                            else:
                                # any other span is part of the form text
                                form_nodes.append(node)
                        elif isinstance(node, HTMLNode) and node.tag == "br":
                            # a line break ends the current form
                            add_form(
                                form_nodes,
                                col_headers,
                                col_index,
                                row_header,
                                raw_tags,
                            )
                        else:
                            form_nodes.append(node)
                    # flush whatever follows the last <br> in the cell
                    add_form(
                        form_nodes, col_headers, col_index, row_header, raw_tags
                    )

1045 )