Coverage for src/wiktextract/extractor/fr/conjugation.py: 93%
343 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 HTMLNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Form, WordEntry
15from .tags import translate_raw_tags
def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """
    Load a page of the "Conjugaison" namespace and extract its forms.

    Each top-level template of the page is handed to the extractor that
    matches its name. A page that transcludes another "Conjugaison:" page
    is followed recursively; the "sél" argument of that transclusion
    (default "2") is forwarded as `select_tab`. Nothing happens when the
    page body can't be found.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    page_body = wxr.wtp.get_page_body(conj_page_title, namespace_id)
    if page_body is None:
        return

    root = wxr.wtp.parse(page_body)
    for template in root.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        if name.endswith("-intro"):
            continue
        # The order of these checks matters: the Kurdish and Korean
        # templates also contain "-conj" and need their own extractors.
        if name in ("ku-conj-trans", "ku-conj"):
            extract_ku_conj_trans_template(
                wxr, entry, template, conj_page_title
            )
        elif name == "ko-conj":
            extract_ko_conj_template(wxr, entry, template, conj_page_title)
        elif "-conj" in name:
            process_conj_template(wxr, entry, template, conj_page_title)
        elif name == "Onglets conjugaison":
            process_onglets_conjugaison_template(
                wxr, entry, template, conj_page_title, select_tab
            )
        elif name.removeprefix(":").startswith("Conjugaison:"):
            # transclusion of another conjugation page
            tab = clean_node(
                wxr, None, template.template_parameters.get("sél", "2")
            )
            extract_conjugation(wxr, entry, name.removeprefix(":"), tab)
        elif name.startswith("ja-flx-adj"):
            process_ja_flx_adj_template(
                wxr, entry, template, conj_page_title
            )
        elif name.startswith("ja-"):
            process_ja_conj_template(wxr, entry, template, conj_page_title)

    # Kurdish pages may contain plain wikitext tables instead of templates
    if conj_page_title.startswith("Conjugaison:kurde/"):
        for table in root.find_child(NodeKind.TABLE):
            extract_ku_conj_trans_table_node(wxr, entry, table, conj_page_title)

    # clean_node() gets no target object here and its result is ignored
    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, None, link_node)
def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """
    Extract forms from the tabs of an "Onglets conjugaison" template.

    The tab contents are read from the "contenu<N>" arguments. Only the tab
    named by `select_tab` is read when `select_tab` is not "1" or when the
    "onglet1" argument is "Conjugaison active"; otherwise tabs 1 to 6 are
    read in order, stopping at the first missing "contenu<N>" argument.
    """
    # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
    # this template expands to two tabs of tables
    params = node.template_parameters
    # "onglet1" is only inspected when the default tab "1" is requested
    single_tab_only = (
        select_tab != "1"
        or clean_node(wxr, None, params.get("onglet1", ""))
        == "Conjugaison active"
    )
    if single_tab_only:
        # don't extract or only extract "Conjugaison pronominale" tab
        tab_indexes = [select_tab]
    else:
        tab_indexes = [str(i) for i in range(1, 7)]

    for tab_index in tab_indexes:
        arg_name = f"contenu{tab_index}"
        if arg_name not in params:
            break
        arg_value = params[arg_name]
        # an argument is either a single node or a list of nodes/strings
        if isinstance(arg_value, TemplateNode):
            candidates = [arg_value]
        elif isinstance(arg_value, list):
            candidates = arg_value
        else:
            candidates = []
        for candidate in candidates:
            if (
                isinstance(candidate, TemplateNode)
                and "-conj" in candidate.template_name
            ):
                process_conj_template(wxr, entry, candidate, conj_page_title)
def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Expand a "-conj" template and extract forms from the expansion."""
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    # https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)
def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Walk an expanded conjugation template and process its table blocks.

    Level (heading) children are visited recursively. Among the HTML
    children, an <h3> updates the current section title and each <div> is
    processed as a conjugation table of that section. The section titled
    "Modes impersonnels" has its own table layout and extractor.
    """
    h3_text = ""
    if node.kind == NodeKind.LEVEL3:
        # the section title comes from the wikitext heading itself
        h3_text = clean_node(wxr, None, node.largs)

    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue

        if child.tag == "h3":
            h3_text = clean_node(wxr, None, child)
        elif child.tag == "div":
            if h3_text == "Modes impersonnels":
                process_fr_conj_modes_table(
                    wxr, entry, child, conj_page_title
                )
            else:
                process_fr_conj_table(
                    wxr, entry, child, h3_text, conj_page_title
                )
@dataclass
class TableHeader:
    """Text of a table header cell plus its position and extent."""

    # cleaned text of the header cell
    text: str
    # first column covered by the header and how many columns it covers
    col_index: int = 0
    colspan: int = 0
    # first row covered by the header and how many rows it covers
    row_index: int = 0
    rowspan: int = 0
def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the "Modes impersonnels" table.

    Rows without any data cell are header rows and supply the column
    headers (the "Mode" cell is skipped). In the other rows a header cell,
    or a bold first cell, is the row header. The remaining cells are joined
    into a form text until a pronunciation cell is reached, at which point
    one `Form` is appended to `entry.forms`.
    """
    # the first "Modes impersonnels" table
    ipa_suffixes = ("]", "\\")
    for table_node in div_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            is_header_row = not row.contain_node(NodeKind.TABLE_CELL)
            row_header = ""
            pending_form = ""
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                is_header_cell = cell.kind == NodeKind.TABLE_HEADER_CELL or (
                    cell.contain_node(NodeKind.BOLD) and col_index == 0
                )
                if is_header_cell:
                    if not is_header_row:
                        row_header = clean_node(wxr, None, cell)
                        continue
                    header_text = clean_node(wxr, None, cell)
                    if header_text != "Mode":
                        colspan, _ = get_cell_span(cell)
                        col_headers.append(
                            TableHeader(header_text, col_index, colspan)
                        )
                        col_index += colspan
                    continue

                cell_text = clean_node(wxr, None, cell)
                has_ipa = cell_text.endswith(ipa_suffixes)
                closes_form = has_ipa or cell_text.endswith("Prononciation ?")
                if closes_form and pending_form != "":
                    # a pronunciation cell ends the form collected so far
                    form = Form(
                        form=pending_form,
                        ipas=[cell_text] if has_ipa else [],
                        source=conj_page_title,
                    )
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for col_header in col_headers:
                        first_col = col_header.col_index
                        if first_col <= col_index < first_col + col_header.colspan:
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    entry.forms.append(form)
                    pending_form = ""
                elif cell_text != "":
                    # no space after an elided word ending in an apostrophe
                    if pending_form != "" and not pending_form.endswith("’"):
                        pending_form += " "
                    pending_form += cell_text
                # every data cell, even an empty one, takes up one column
                col_index += 1
def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Find the tense tables nested in the cells of a layout table.

    Every cell of every table under `div_node` is scanned for nested tables,
    which are either HTML <table> nodes or wikitext tables; each one is
    passed to the matching extractor together with the section title.
    """
    for outer_table in div_node.find_child(NodeKind.TABLE):
        for outer_row in outer_table.find_child(NodeKind.TABLE_ROW):
            for outer_cell in outer_row.find_child(NodeKind.TABLE_CELL):
                for nested in outer_cell.children:
                    if not isinstance(nested, WikiNode):
                        continue
                    if nested.kind == NodeKind.HTML and nested.tag == "table":
                        process_fr_conj_html_table(
                            wxr, entry, nested, h3_text, conj_page_title
                        )
                    elif nested.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, nested, h3_text, conj_page_title
                        )
def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Extract forms from a tense table written with HTML <table> tags.

    The text of the first <tr> is used as a raw tag (after `h3_text` when
    that is not empty). Each following <tr> yields one `Form`: the first
    two <td> cells make up the form text and the remaining cells are
    concatenated into a single pronunciation string. Rows that produce no
    form text are skipped, like in `process_fr_conj_wiki_table`.
    """
    tags = [h3_text] if h3_text != "" else []
    for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
        if tr_index == 0:
            tags.append(clean_node(wxr, None, tr_node.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for td_index, td_node in enumerate(
            tr_node.find_html_recursively("td")
        ):
            td_text = clean_node(wxr, None, td_node)
            if td_index < 2:
                form.form += td_text
                # no space after an elided pronoun ending in an apostrophe
                if td_index == 0 and not td_text.endswith("’"):
                    form.form += " "
            elif len(form.ipas) > 0:
                form.ipas[0] += td_text
            else:
                # no space after the liaison mark
                if not td_text.endswith("‿"):
                    td_text += " "
                form.ipas.append(td_text)

        # a row without usable <td> text would give a form with no word
        if form.form.strip() == "":
            continue
        translate_raw_tags(form)
        entry.forms.append(form)
def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Extract forms from a tense table written as a wikitext table.

    The text of the first row is used as a raw tag (after `h3_text` when
    that is not empty). In each following row the first two cells make up
    the form text and the later cells are pronunciations. Rows that yield
    no form text are dropped.
    """
    raw_tags = []
    if h3_text != "":
        raw_tags.append(h3_text)

    for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
        if row_index == 0:
            raw_tags.append(clean_node(wxr, None, row.children))
            continue

        form = Form(raw_tags=raw_tags, source=conj_page_title)
        for cell_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            cell_text = clean_node(wxr, None, cell)
            is_placeholder = cell_text.endswith("Prononciation ?")
            if cell_index >= 2:
                if not is_placeholder:
                    form.ipas.append(cell_text)
                continue
            if cell_text == "—" or is_placeholder:
                continue
            stripped_form = form.form.strip()
            if cell_text.startswith("-") and not stripped_form.endswith(")"):
                # attach a "-..." part directly to the preceding text
                form.form = stripped_form
            form.form += cell_text
            if cell_index == 0 and len(cell_text) > 0:
                form.form += " "

        if len(form.form) > 0:
            translate_raw_tags(form)
            entry.forms.append(form)
def process_ja_flx_adj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from an expanded "ja-flx-adj" template.

    A header cell sets the tag used for the rows that follow it. In a data
    row the lines of the first cell are extra tags, and the lines of the
    next three cells are the form, its hiragana and its romanization; the
    N-th line of each of those cells belongs to the N-th form of the row.
    """
    # https://fr.wiktionary.org/wiki/Modèle:ja-adj
    # https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な
    field_of_cell = {1: "form", 2: "hiragana", 3: "roman"}
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table in expanded.find_child(NodeKind.TABLE):
        first_tag = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_forms = []
            # a header found in this row only applies to later rows
            row_tags = [first_tag]
            row_cells = row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
            for cell_index, cell in enumerate(row_cells):
                cell_text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    first_tag = cell_text
                    continue
                if cell_index == 0:
                    row_tags.extend(cell_text.splitlines())
                    continue
                for line_index, line in enumerate(cell_text.splitlines()):
                    if line_index + 1 > len(row_forms):
                        row_forms.append(
                            translate_raw_tags(
                                Form(raw_tags=row_tags, source=conj_page_title)
                            )
                        )
                    field_name = field_of_cell.get(cell_index)
                    if field_name is not None:
                        setattr(row_forms[line_index], field_name, line)

            entry.forms.extend(row_forms)
def process_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from an expanded Japanese verb conjugation template.

    A row holding a single header cell sets the tag shared by the rows
    below it. Other header cells are row headers that stay active for as
    many rows as their "rowspan" attribute says; a missing or malformed
    "rowspan" counts as 1. The data cells of a row are, in order, the form,
    its hiragana and its romanization. Rows without a form are dropped.
    """
    # https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    # Modèle:ja-在る
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows, current one included, still covered
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if len(row.children) > 1 and all(
                isinstance(c, WikiNode)
                and c.kind == NodeKind.TABLE_HEADER_CELL
                for c in row.children
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                    continue
                try:
                    rowspan = int(header.attrs.get("rowspan", "1"))
                except ValueError:
                    # a malformed attribute must not abort the extraction
                    rowspan = 1
                row_headers[header_text] = rowspan

            tags = [first_tag]
            # iterate over a copy because expired headers are removed
            for tag, rowspan in row_headers.copy().items():
                tags.append(tag)
                if rowspan == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = rowspan - 1

            form = Form(raw_tags=tags, source=conj_page_title)
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_index == 0:
                    form.form = cell_text
                elif cell_index == 1:
                    form.hiragana = cell_text
                elif cell_index == 2:
                    form.roman = cell_text
            if len(form.form) > 0:
                translate_raw_tags(form)
                entry.forms.append(form)
def extract_ku_conj_trans_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a Kurdish conjugation template and extract its tables.

    Each top-level table of the expansion is passed to
    `extract_ku_conj_trans_table_node()`. Top-level links are then run
    through `clean_node()` with `entry` as the target object.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table_node in expanded_root.find_child(NodeKind.TABLE):
        extract_ku_conj_trans_table_node(
            wxr, entry, table_node, conj_page_title
        )
    for link in expanded_root.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link)
def extract_ku_conj_trans_table_node(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from one Kurdish conjugation table.

    Header cells supply column headers, except the title-like headers
    listed in `ignore_headers`. The collected column headers are reset
    when a row with header cells follows a row without any. Every
    non-empty data cell becomes a `Form` tagged with the headers covering
    its column, unless it is "(inusité)", equals the page title, or
    follows a "TEMPS DU PASSÉ" header.
    """
    from .inflection import ColspanHeader

    ignore_headers = (
        "Conjugaison du verbe",
        "TEMPS DU PRÉSENT ET DU FUTUR",
        "TEMPS DU PRESENT ET DU FUTUR",
        "TEMPS DU PASSÉ",
        "TEMPS DU PASSE",
    )
    col_headers = []
    prev_row_has_header = False
    last_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_has_header = row.contain_node(NodeKind.TABLE_HEADER_CELL)
        if row_has_header and not prev_row_has_header:
            # a new group of header rows starts a new sub-table
            col_headers.clear()

        col_index = 0
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_str = clean_node(wxr, None, cell)
            if cell_str == "":
                col_index += 1
            elif cell.kind == NodeKind.TABLE_HEADER_CELL:
                last_header = cell_str
                if not cell_str.startswith(ignore_headers):
                    colspan, _ = get_cell_span(cell)
                    col_headers.append(
                        ColspanHeader(
                            text=cell_str, index=col_index, span=colspan
                        )
                    )
                    col_index += colspan
            elif last_header == "TEMPS DU PASSÉ":
                # NOTE(review): headers are ignored by prefix and in both
                # accented and unaccented spellings, but this test is an
                # exact match on the accented one only - confirm intent.
                continue
            elif cell_str == "(inusité)":
                col_index += 1
            elif cell_str != wxr.wtp.title:
                form = Form(form=cell_str, source=conj_page_title)
                for header in col_headers:
                    if header.index <= col_index < header.index + header.span:
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                entry.forms.append(form)
                col_index += 1
        prev_row_has_header = row_has_header
516def extract_ko_conj_template(
517 wxr: WiktextractContext,
518 entry: WordEntry,
519 t_node: TemplateNode,
520 conj_page_title: str,
521) -> None:
522 word_page_title = wxr.wtp.title
523 wxr.wtp.title = conj_page_title
524 expanded_node = wxr.wtp.parse(
525 wxr.wtp.node_to_wikitext(t_node), expand_all=True
526 )
527 for h3 in expanded_node.find_html("h3"):
528 clean_node(wxr, entry, h3)
529 for table_index, table in enumerate(
530 expanded_node.find_child(NodeKind.TABLE)
531 ):
532 if table_index == 0:
533 continue
534 shared_raw_tags = []
535 for caption_node in table.find_child(NodeKind.TABLE_CAPTION):
536 caption = clean_node(wxr, None, caption_node.children)
537 if caption != "": 537 ↛ 535line 537 didn't jump to line 535 because the condition on line 537 was always true
538 shared_raw_tags.append(caption)
539 col_headers = []
540 row_headers = []
541 row_index = 0
542 row_header_indexes = [0]
543 for row in table.find_child(NodeKind.TABLE_ROW):
544 col_index = 0
545 for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
546 cell_str = clean_node(wxr, None, header_cell)
547 if cell_str == "":
548 continue
549 colspan, rowspan = get_cell_span(header_cell)
550 if row.contain_node(NodeKind.TABLE_CELL):
551 header_added = False
552 current_row_index = row_index
553 for index, row_header_index in enumerate( 553 ↛ 561line 553 didn't jump to line 561 because the loop on line 553 didn't complete
554 row_header_indexes
555 ):
556 if row_index >= row_header_index:
557 current_row_index = row_header_indexes[index]
558 row_header_indexes[index] += rowspan
559 header_added = True
560 break
561 if not header_added: 561 ↛ 562line 561 didn't jump to line 562 because the condition on line 561 was never true
562 row_header_indexes.append(rowspan)
563 row_headers.append(
564 TableHeader(
565 text=cell_str,
566 row_index=current_row_index,
567 rowspan=rowspan,
568 )
569 )
570 else:
571 col_headers.append(
572 TableHeader(
573 text=cell_str,
574 col_index=col_index,
575 colspan=colspan,
576 )
577 )
578 col_index += colspan
579 if row.contain_node(NodeKind.TABLE_CELL):
580 row_index += 1
582 row_index = 0
583 for row in table.find_child(NodeKind.TABLE_ROW):
584 col_index = 0
585 for cell in row.find_child(NodeKind.TABLE_CELL):
586 cell_str = clean_node(wxr, None, cell)
587 colspan, rowspan = get_cell_span(cell)
588 if cell_str == "—": 588 ↛ 589line 588 didn't jump to line 589 because the condition on line 588 was never true
589 col_index += 1
590 else:
591 form = Form(
592 source=conj_page_title, raw_tags=shared_raw_tags
593 )
594 for line_index, line in enumerate(cell_str.splitlines()):
595 match line_index:
596 case 0:
597 form.form = line
598 case 1:
599 form.roman = line
600 case 2: 600 ↛ 594line 600 didn't jump to line 594 because the pattern on line 600 always matched
601 form.ipas.append(line)
602 for header in col_headers:
603 if (
604 col_index >= header.col_index
605 and col_index < header.col_index + header.colspan
606 ):
607 form.raw_tags.append(header.text)
608 for header in row_headers:
609 if (
610 row_index < header.row_index + header.rowspan
611 and row_index + rowspan > header.row_index
612 ):
613 form.raw_tags.append(header.text)
614 if form.form not in ["", wxr.wtp.title]: 614 ↛ 617line 614 didn't jump to line 617 because the condition on line 614 was always true
615 translate_raw_tags(form)
616 entry.forms.append(form)
617 col_index += 1
618 if row.contain_node(NodeKind.TABLE_CELL):
619 row_index += 1
621 for link in expanded_node.find_child(NodeKind.LINK):
622 clean_node(wxr, entry, link)
623 wxr.wtp.title = word_page_title
def get_cell_span(cell: WikiNode) -> tuple[int, int]:
    """
    Return the (colspan, rowspan) of a table cell.

    Each value is read from the attribute of the same name and is 1 when
    that attribute is missing or is not made of digits only.
    """
    spans = []
    for attr_name in ("colspan", "rowspan"):
        raw_value = cell.attrs.get(attr_name, "1")
        is_number = re.fullmatch(r"\d+", raw_value) is not None
        spans.append(int(raw_value) if is_number else 1)
    return spans[0], spans[1]