Coverage for src/wiktextract/extractor/fr/conjugation.py: 93%

1import re

2from dataclasses import dataclass

4from wikitextprocessor.parser import (

5 LEVEL_KIND_FLAGS,

6 HTMLNode,

7 NodeKind,

8 TemplateNode,

9 WikiNode,

10)

12from ...page import clean_node

13from ...wxr_context import WiktextractContext

14from .models import Form, WordEntry

15from .tags import translate_raw_tags

def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """
    Read a page of the "Conjugaison" namespace and extract its forms.

    Each top-level template of the page is dispatched on its name to the
    matching extractor in this module.  A template whose name starts with
    "Conjugaison:" (optionally preceded by ":") is followed recursively,
    with its "sél" argument (default "2") used as `select_tab`.
    Returns without doing anything when the page body is not found.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    page_body = wxr.wtp.get_page_body(conj_page_title, namespace_id)
    if page_body is None:
        return

    root = wxr.wtp.parse(page_body)
    for t_node in root.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name.endswith("-intro"):
            continue

        # the order of the tests matters: "ku-conj" and "ko-conj" would
        # otherwise be caught by the generic "-conj" test
        if name in ["ku-conj-trans", "ku-conj"]:
            extract_ku_conj_trans_template(wxr, entry, t_node, conj_page_title)
        elif name == "ko-conj":
            extract_ko_conj_template(wxr, entry, t_node, conj_page_title)
        elif "-conj" in name:
            process_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "Onglets conjugaison":
            process_onglets_conjugaison_template(
                wxr, entry, t_node, conj_page_title, select_tab
            )
        elif name.removeprefix(":").startswith("Conjugaison:"):
            # another conjugation page is transcluded: follow it
            tab = clean_node(
                wxr, None, t_node.template_parameters.get("sél", "2")
            )
            extract_conjugation(wxr, entry, name.removeprefix(":"), tab)
        elif name.startswith("ja-flx-adj"):
            process_ja_flx_adj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("ja-"):
            process_ja_conj_template(wxr, entry, t_node, conj_page_title)

    if conj_page_title.startswith("Conjugaison:kurde/"):
        # tables written directly in the page instead of through a template
        for table in root.find_child(NodeKind.TABLE):
            extract_ku_conj_trans_table_node(wxr, entry, table, conj_page_title)

    # NOTE(review): the result is discarded and no data object is passed,
    # unlike the link loops of the "ku" and "ko" extractors which pass
    # `entry` - confirm whether `None` is intended here
    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, None, link_node)

def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """
    Extract the conjugation templates held in "Onglets conjugaison" tabs.

    Tab contents are read from the "contenu<N>" template arguments.  Only
    the tab `select_tab` is read, unless `select_tab` is "1" and the
    "onglet1" argument is not "Conjugaison active": then tabs "1" to "6"
    are read, stopping at the first missing "contenu<N>" argument.
    """
    # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
    # this template expands to two tabs of tables
    read_all_tabs = (
        select_tab == "1"
        and clean_node(wxr, None, node.template_parameters.get("onglet1", ""))
        != "Conjugaison active"
    )
    if read_all_tabs:
        tab_indexes = [str(i) for i in range(1, 7)]
    else:
        tab_indexes = [select_tab]

    for tab_index in tab_indexes:
        arg_name = f"contenu{tab_index}"
        if arg_name not in node.template_parameters:
            break
        tab_content = node.template_parameters[arg_name]
        # the argument is either a single template node or a list of nodes
        if isinstance(tab_content, TemplateNode):
            candidates = [tab_content]
        elif isinstance(tab_content, list):
            candidates = tab_content
        else:
            candidates = []
        for candidate in candidates:
            if (
                isinstance(candidate, TemplateNode)
                and "-conj" in candidate.template_name
            ):
                process_conj_template(wxr, entry, candidate, conj_page_title)

115

116

def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Expand a "*-conj*" template and extract forms from the expansion."""
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    # https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)

131

132

def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Walk an expanded conjugation template and extract its tables.

    Level (heading) children are processed recursively.  The text of the
    most recent heading - the level 3 title of `node` itself or a previous
    <h3> sibling - is remembered and decides how each following <div> is
    parsed: "Modes impersonnels" has its own table layout.
    """
    h3_text = ""
    if node.kind == NodeKind.LEVEL3:
        h3_text = clean_node(wxr, None, node.largs)

    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue

        if child.tag == "h3":
            h3_text = clean_node(wxr, None, child)
        elif child.tag == "div":
            if h3_text == "Modes impersonnels":
                process_fr_conj_modes_table(wxr, entry, child, conj_page_title)
            else:
                process_fr_conj_table(
                    wxr, entry, child, h3_text, conj_page_title
                )

159

160

@dataclass
class TableHeader:
    """Text and grid position of a table header cell.

    The field order is relied upon by positional construction in this
    module, e.g. ``TableHeader(text, col_index, colspan)``.
    """

    # cleaned text of the header cell, used as a raw tag
    text: str
    # index of the first column covered by a column header
    col_index: int = 0
    # number of columns covered, starting at `col_index`
    colspan: int = 0
    # index of the first row covered by a row header
    row_index: int = 0
    # number of rows covered, starting at `row_index`
    rowspan: int = 0

168

169

def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the tables of the "Modes impersonnels" section.

    Rows made only of header cells give the column headers (the "Mode"
    header is skipped).  In other rows a header cell, or a bold cell met
    while the column index is still 0, gives the row header.  The texts of
    the remaining cells are concatenated into a form until a cell ending
    with "]", "\\" or "Prononciation ?" closes it; the closing cell is
    stored as IPA unless it ends with "Prononciation ?".
    """
    # the first "Modes impersonnels" table
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in div_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            is_header_row = not row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            form_text = ""
            for node in row.find_child(cell_kinds):
                is_header_cell = node.kind == NodeKind.TABLE_HEADER_CELL or (
                    node.contain_node(NodeKind.BOLD) and col_index == 0
                )
                if is_header_cell:
                    if not is_header_row:
                        # row header: does not advance the column index
                        row_header = clean_node(wxr, None, node)
                        continue
                    header_text = clean_node(wxr, None, node)
                    if header_text != "Mode":
                        colspan, _ = get_cell_span(node)
                        col_headers.append(
                            TableHeader(header_text, col_index, colspan)
                        )
                        col_index += colspan
                    continue

                node_text = clean_node(wxr, None, node)
                has_ipa = node_text.endswith(("]", "\\"))
                closes_form = has_ipa or node_text.endswith("Prononciation ?")
                if closes_form and form_text != "":
                    form = Form(
                        form=form_text,
                        ipas=[node_text] if has_ipa else [],
                        source=conj_page_title,
                    )
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for col_header in col_headers:
                        first_col = col_header.col_index
                        if first_col <= col_index < first_col + col_header.colspan:
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    entry.forms.append(form)
                    form_text = ""
                elif node_text != "":
                    # no space after an elision apostrophe
                    if form_text != "" and not form_text.endswith("’"):
                        form_text += " "
                    form_text += node_text
                col_index += 1

236

237

def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Find the tables nested in the cells of the layout tables of `div_node`.

    A nested table is either an HTML <table> node or a wikitext table
    node; each kind is handed to its own extractor together with
    `h3_text`.
    """
    for table_node in div_node.find_child(NodeKind.TABLE):
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(NodeKind.TABLE_CELL):
                for cell_child in cell.children:
                    if not isinstance(cell_child, WikiNode):
                        continue
                    if (
                        cell_child.kind == NodeKind.HTML
                        and cell_child.tag == "table"
                    ):
                        process_fr_conj_html_table(
                            wxr, entry, cell_child, h3_text, conj_page_title
                        )
                    elif cell_child.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, cell_child, h3_text, conj_page_title
                        )

265

266

def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per <tr> of an HTML conjugation table.

    The text of the first <tr> is used as a raw tag, after `h3_text` when
    it is not empty.  In every other <tr> the first two <td> are joined
    into the form and the remaining <td> are joined into a single IPA
    string.
    """
    tags = []
    if h3_text != "":
        tags.append(h3_text)

    for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
        if tr_index == 0:
            tags.append(clean_node(wxr, None, tr_node.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for td_index, td_node in enumerate(tr_node.find_html_recursively("td")):
            td_text = clean_node(wxr, None, td_node)
            if td_index < 2:
                form.form += td_text
                # no space after an elision apostrophe
                if td_index == 0 and not td_text.endswith("’"):
                    form.form += " "
            elif len(form.ipas) > 0:
                form.ipas[0] += td_text
            else:
                # no space after the liaison mark
                if not td_text.endswith("‿"):
                    td_text += " "
                form.ipas.append(td_text)

        translate_raw_tags(form)
        entry.forms.append(form)

298

299

def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per row of a wikitext conjugation table.

    The text of the first row is used as a raw tag, after `h3_text` when
    it is not empty.  In every other row the first two cells are joined
    into the form and each later cell is stored as an IPA string; cells
    ending with "Prononciation ?" are ignored.  Rows that give an empty
    form are dropped.
    """
    tags = []
    if h3_text != "":
        tags.append(h3_text)

    for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
        if row_index == 0:
            tags.append(clean_node(wxr, None, row.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for cell_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            cell_text = clean_node(wxr, None, cell)
            no_pronunciation = cell_text.endswith("Prononciation ?")
            if cell_index >= 2:
                if not no_pronunciation:
                    form.ipas.append(cell_text)
                continue

            if cell_text == "—" or no_pronunciation:
                continue
            # a text starting with "-" is glued to the previous text unless
            # that one ends with ")"
            if cell_text.startswith("-") and not form.form.strip().endswith(
                ")"
            ):
                form.form = form.form.strip()
            form.form += cell_text
            if cell_index == 0 and len(cell_text) > 0:
                form.form += " "

        if len(form.form) > 0:
            translate_raw_tags(form)
            entry.forms.append(form)

335

336

337def process_ja_flx_adj_template(

338 wxr: WiktextractContext,

339 entry: WordEntry,

340 template_node: TemplateNode,

341 conj_page_title: str,

342) -> None:

343 # https://fr.wiktionary.org/wiki/Modèle:ja-adj

344 # https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な

345 expanded_template = wxr.wtp.parse(

346 wxr.wtp.node_to_wikitext(template_node), expand_all=True

347 )

348 for table_node in expanded_template.find_child(NodeKind.TABLE):

349 first_tag = ""

350 for row in table_node.find_child(NodeKind.TABLE_ROW):

351 forms = []

352 tags = [first_tag]

353 for cell_index, row_child in enumerate(

354 row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)

355 ):

356 row_child_text = clean_node(wxr, None, row_child)

357 if row_child.kind == NodeKind.TABLE_HEADER_CELL:

358 first_tag = row_child_text

359 else:

360 for line_index, line in enumerate(

361 row_child_text.splitlines()

362 ):

363 if cell_index == 0:

364 tags.append(line)

365 continue

366 if line_index + 1 > len(forms):

367 forms.append(

368 translate_raw_tags(

369 Form(raw_tags=tags, source=conj_page_title)

370 )

371 )

372 if cell_index == 1:

373 forms[line_index].form = line

374 elif cell_index == 2:

375 forms[line_index].hiragana = line

376 elif cell_index == 3: 376 ↛ 360line 376 didn't jump to line 360 because the condition on line 376 was always true

377 forms[line_index].roman = line

378

379 entry.forms.extend(forms)

380

381

def process_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the tables of an expanded Japanese "ja-*" template.

    A row whose only child is a header cell sets the first raw tag of the
    following rows.  Other header cells are row headers that stay active
    for as many rows as their "rowspan" attribute says.  The data cells at
    indexes 0, 1 and 2 are the form, hiragana and romanization; rows
    without a form are dropped.
    """
    # https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    # Modèle:ja-在る
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows (current one included) it still spans
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if (
                all(
                    isinstance(c, WikiNode)
                    and c.kind == NodeKind.TABLE_HEADER_CELL
                    for c in row.children
                )
                and len(row.children) > 1
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    # get_cell_span() falls back to 1 when the attribute is
                    # missing or not a plain number, where a bare int()
                    # would raise ValueError and abort the extraction
                    _, rowspan = get_cell_span(header)
                    row_headers[header_text] = rowspan

            tags = [first_tag]
            # iterate over a copy: exhausted headers are deleted on the fly
            for tag, rowspan in row_headers.copy().items():
                tags.append(tag)
                if rowspan == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = rowspan - 1

            form = Form(raw_tags=tags, source=conj_page_title)
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_index == 0:
                    form.form = cell_text
                elif cell_index == 1:
                    form.hiragana = cell_text
                elif cell_index == 2:
                    form.roman = cell_text
            if len(form.form) > 0:
                translate_raw_tags(form)
                entry.forms.append(form)

438

439

def extract_ku_conj_trans_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a "ku-conj" or "ku-conj-trans" template and extract its tables.

    The top-level links of the expansion are then passed through
    `clean_node` together with `entry`.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table in expanded_root.find_child(NodeKind.TABLE):
        extract_ku_conj_trans_table_node(wxr, entry, table, conj_page_title)
    # presumably collects the categories of the expansion - confirm against
    # clean_node()
    for link_node in expanded_root.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link_node)

453

454

def extract_ku_conj_trans_table_node(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from a Kurdish conjugation table.

    Header cells become column headers, except those starting with one of
    the table or section titles listed in `ignore_headers`.  The column
    headers are reset when a row with header cells follows a row without
    any.  Every non-empty data cell becomes a form tagged with the column
    headers covering its column, except "(inusité)" cells, cells equal to
    the current page title, and cells read while the last header seen is
    "TEMPS DU PASSÉ".
    """
    from .inflection import ColspanHeader

    ignore_headers = (
        "Conjugaison du verbe",
        "TEMPS DU PRÉSENT ET DU FUTUR",
        "TEMPS DU PRESENT ET DU FUTUR",
        "TEMPS DU PASSÉ",
        "TEMPS DU PASSE",
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    col_headers = []
    last_row_has_header = False
    last_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        current_row_has_header = row.contain_node(NodeKind.TABLE_HEADER_CELL)
        if current_row_has_header and not last_row_has_header:
            # a new group of header rows starts
            col_headers.clear()

        for cell in row.find_child(cell_kinds):
            cell_str = clean_node(wxr, None, cell)
            if cell_str == "":
                col_index += 1
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                last_header = cell_str
                if cell_str.startswith(ignore_headers):
                    continue
                colspan, _ = get_cell_span(cell)
                col_headers.append(
                    ColspanHeader(text=cell_str, index=col_index, span=colspan)
                )
                col_index += colspan
                continue

            # NOTE(review): only the accented spelling is tested although
            # `ignore_headers` also lists "TEMPS DU PASSE" - confirm
            if last_header == "TEMPS DU PASSÉ":
                continue
            if cell_str == "(inusité)":
                col_index += 1
                continue
            # NOTE(review): a cell equal to the page title is skipped
            # without advancing `col_index` - confirm this is intended
            if cell_str == wxr.wtp.title:
                continue

            form = Form(form=cell_str, source=conj_page_title)
            for header in col_headers:
                if header.index <= col_index < header.index + header.span:
                    form.raw_tags.append(header.text)
            translate_raw_tags(form)
            entry.forms.append(form)
            col_index += 1

        last_row_has_header = current_row_has_header

514

515

516def extract_ko_conj_template(

517 wxr: WiktextractContext,

518 entry: WordEntry,

519 t_node: TemplateNode,

520 conj_page_title: str,

521) -> None:

522 word_page_title = wxr.wtp.title

523 wxr.wtp.title = conj_page_title

524 expanded_node = wxr.wtp.parse(

525 wxr.wtp.node_to_wikitext(t_node), expand_all=True

526 )

527 for h3 in expanded_node.find_html("h3"):

528 clean_node(wxr, entry, h3)

529 for table_index, table in enumerate(

530 expanded_node.find_child(NodeKind.TABLE)

531 ):

532 if table_index == 0:

533 continue

534 shared_raw_tags = []

535 for caption_node in table.find_child(NodeKind.TABLE_CAPTION):

536 caption = clean_node(wxr, None, caption_node.children)

537 if caption != "": 537 ↛ 535line 537 didn't jump to line 535 because the condition on line 537 was always true

538 shared_raw_tags.append(caption)

539 col_headers = []

540 row_headers = []

541 row_index = 0

542 row_header_indexes = [0]

543 for row in table.find_child(NodeKind.TABLE_ROW):

544 col_index = 0

545 for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):

546 cell_str = clean_node(wxr, None, header_cell)

547 if cell_str == "":

548 continue

549 colspan, rowspan = get_cell_span(header_cell)

550 if row.contain_node(NodeKind.TABLE_CELL):

551 header_added = False

552 current_row_index = row_index

553 for index, row_header_index in enumerate( 553 ↛ 561line 553 didn't jump to line 561 because the loop on line 553 didn't complete

554 row_header_indexes

555 ):

556 if row_index >= row_header_index:

557 current_row_index = row_header_indexes[index]

558 row_header_indexes[index] += rowspan

559 header_added = True

560 break

561 if not header_added: 561 ↛ 562line 561 didn't jump to line 562 because the condition on line 561 was never true

562 row_header_indexes.append(rowspan)

563 row_headers.append(

564 TableHeader(

565 text=cell_str,

566 row_index=current_row_index,

567 rowspan=rowspan,

568 )

569 )

570 else:

571 col_headers.append(

572 TableHeader(

573 text=cell_str,

574 col_index=col_index,

575 colspan=colspan,

576 )

577 )

578 col_index += colspan

579 if row.contain_node(NodeKind.TABLE_CELL):

580 row_index += 1

581

582 row_index = 0

583 for row in table.find_child(NodeKind.TABLE_ROW):

584 col_index = 0

585 for cell in row.find_child(NodeKind.TABLE_CELL):

586 cell_str = clean_node(wxr, None, cell)

587 colspan, rowspan = get_cell_span(cell)

588 if cell_str == "—": 588 ↛ 589line 588 didn't jump to line 589 because the condition on line 588 was never true

589 col_index += 1

590 else:

591 form = Form(

592 source=conj_page_title, raw_tags=shared_raw_tags

593 )

594 for line_index, line in enumerate(cell_str.splitlines()):

595 match line_index:

596 case 0:

597 form.form = line

598 case 1:

599 form.roman = line

600 case 2: 600 ↛ 594line 600 didn't jump to line 594 because the pattern on line 600 always matched

601 form.ipas.append(line)

602 for header in col_headers:

603 if (

604 col_index >= header.col_index

605 and col_index < header.col_index + header.colspan

606 ):

607 form.raw_tags.append(header.text)

608 for header in row_headers:

609 if (

610 row_index < header.row_index + header.rowspan

611 and row_index + rowspan > header.row_index

612 ):

613 form.raw_tags.append(header.text)

614 if form.form not in ["", wxr.wtp.title]: 614 ↛ 617line 614 didn't jump to line 617 because the condition on line 614 was always true

615 translate_raw_tags(form)

616 entry.forms.append(form)

617 col_index += 1

618 if row.contain_node(NodeKind.TABLE_CELL):

619 row_index += 1

620

621 for link in expanded_node.find_child(NodeKind.LINK):

622 clean_node(wxr, entry, link)

623 wxr.wtp.title = word_page_title

624

625

def get_cell_span(cell: WikiNode) -> tuple[int, int]:
    """Return the ``(colspan, rowspan)`` of a table cell.

    Each value is read from the attribute of the same name in `cell.attrs`
    and falls back to 1 when the attribute is missing or is not made of
    digits only.
    """
    spans = []
    for attr_name in ("colspan", "rowspan"):
        attr_value = cell.attrs.get(attr_name, "1")
        if re.fullmatch(r"\d+", attr_value) is not None:
            spans.append(int(attr_value))
        else:
            spans.append(1)
    return spans[0], spans[1]