Coverage for src/wiktextract/extractor/fr/conjugation.py: 92%
456 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 HTMLNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Form, WordEntry
15from .tags import translate_raw_tags
def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """
    Load a page of the "Conjugaison" namespace and extract its tables.

    Every top-level template of the page is dispatched on its name to the
    matching extractor. A template whose name is itself a "Conjugaison:"
    page title is followed recursively, its "sél" argument (default "2")
    becoming the selected tab. Nothing happens if the page doesn't exist.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    page_text = wxr.wtp.get_page_body(conj_page_title, namespace_id)
    if page_text is None:
        return

    root = wxr.wtp.parse(page_text)
    for t_node in root.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name.endswith("-intro"):
            continue

        # The order of the tests matters: language-specific "-conj"
        # templates must be matched before the generic "-conj" test.
        if name in ["ku-conj-trans", "ku-conj"]:
            extract_ku_conj_trans_template(wxr, entry, t_node, conj_page_title)
        elif name == "ko-conj":
            extract_ko_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "de-conj":
            extract_de_conj_template(wxr, entry, t_node, conj_page_title)
        elif "-conj" in name:
            process_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "Onglets conjugaison":
            process_onglets_conjugaison_template(
                wxr, entry, t_node, conj_page_title, select_tab
            )
        elif name.removeprefix(":").startswith("Conjugaison:"):
            # transclusion of another conjugation page
            selected = clean_node(
                wxr, None, t_node.template_parameters.get("sél", "2")
            )
            extract_conjugation(wxr, entry, name.removeprefix(":"), selected)
        elif name.startswith("ja-flx-adj"):
            process_ja_flx_adj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("ja-"):
            process_ja_conj_template(wxr, entry, t_node, conj_page_title)

    if conj_page_title.startswith("Conjugaison:kurde/"):
        # Kurdish pages may hold tables written directly in the page body
        for table_node in root.find_child(NodeKind.TABLE):
            extract_ku_conj_trans_table_node(
                wxr, entry, table_node, conj_page_title
            )

    for link in root.find_child(NodeKind.LINK):
        # NOTE(review): the cleaned text is discarded and no category
        # target is passed, unlike the other extractors - confirm intent.
        clean_node(wxr, None, link)
def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """
    Extract the "-conj" templates placed in the tabs of the
    "Onglets conjugaison" template.

    https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
    The template expands to tabs of tables; tab N is passed in the
    "contenuN" argument and its title in "ongletN".
    """
    params = node.template_parameters
    # Only the selected tab is read when a tab other than the first one is
    # requested, or when the first tab is titled "Conjugaison active";
    # otherwise all of the tabs 1 to 6 are read.
    if select_tab != "1" or (
        clean_node(wxr, None, params.get("onglet1", ""))
        == "Conjugaison active"
    ):
        tab_indexes = [select_tab]
    else:
        tab_indexes = [str(number) for number in range(1, 7)]

    for tab_index in tab_indexes:
        content_key = f"contenu{tab_index}"
        if content_key not in params:
            break
        content = params[content_key]
        # a tab content is either one template node or a list of nodes
        if isinstance(content, TemplateNode) and (
            "-conj" in content.template_name
        ):
            process_conj_template(wxr, entry, content, conj_page_title)
        elif isinstance(content, list):
            for item in content:
                if not isinstance(item, TemplateNode):
                    continue
                if "-conj" in item.template_name:
                    process_conj_template(wxr, entry, item, conj_page_title)
def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a "-conj" template and extract the forms of the expanded tree.

    https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)
def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Walk an expanded conjugation template and hand each <div> to the
    table extractor selected by the most recent heading.

    The heading is the title of `node` when it is a level 3 section, then
    the text of the last <h3> element met among the children. Nested
    section nodes are processed recursively.
    """
    heading = ""
    if node.kind == NodeKind.LEVEL3:
        heading = clean_node(wxr, None, node.largs)

    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue
        if child.tag == "h3":
            heading = clean_node(wxr, None, child)
        elif child.tag == "div":
            if heading == "Modes impersonnels":
                # this section uses its own table layout
                process_fr_conj_modes_table(wxr, entry, child, conj_page_title)
            else:
                process_fr_conj_table(
                    wxr, entry, child, heading, conj_page_title
                )
@dataclass
class TableHeader:
    """Text of a table header cell and the cells range it applies to."""

    # cleaned text of the header cell
    text: str
    # first column covered by a column header
    col_index: int = 0
    # number of columns covered by a column header
    colspan: int = 0
    # first row covered by a row header
    row_index: int = 0
    # number of rows covered by a row header
    rowspan: int = 0
def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """
    Extract the forms of the first table, "Modes impersonnels".

    Rows without data cells give the column headers. In the other rows a
    header cell, or a bold first cell, is the row header; the texts of the
    next cells are joined into one form until a pronunciation cell is
    found, which completes the form.
    """
    pronunciation_suffixes = ("]", "\\", "Prononciation ?")
    for table in div_node.find_child(NodeKind.TABLE):
        column_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            header_only_row = not row.contain_node(NodeKind.TABLE_CELL)
            row_label = ""
            column = 0
            pending_form = ""
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                header_like = cell.kind == NodeKind.TABLE_HEADER_CELL or (
                    cell.contain_node(NodeKind.BOLD) and column == 0
                )
                if header_like and not header_only_row:
                    row_label = clean_node(wxr, None, cell)
                    continue
                if header_like:
                    label = clean_node(wxr, None, cell)
                    if label == "Mode":
                        continue
                    span_text = cell.attrs.get("colspan", "1")
                    span = 1
                    if re.fullmatch(r"\d+", span_text) is not None:
                        span = int(span_text)
                    column_headers.append(TableHeader(label, column, span))
                    column += span
                    continue

                cell_text = clean_node(wxr, None, cell)
                if pending_form != "" and cell_text.endswith(
                    pronunciation_suffixes
                ):
                    # "Prononciation ?" is a placeholder, not an IPA value
                    ipas = []
                    if cell_text.endswith(("]", "\\")):
                        ipas.append(cell_text)
                    form = Form(
                        form=pending_form, ipas=ipas, source=conj_page_title
                    )
                    if row_label != "":
                        form.raw_tags.append(row_label)
                    form.raw_tags.extend(
                        header.text
                        for header in column_headers
                        if header.col_index
                        <= column
                        < header.col_index + header.colspan
                    )
                    translate_raw_tags(form)
                    entry.forms.append(form)
                    pending_form = ""
                elif cell_text != "":
                    # no space is added after an elided word like "s’"
                    if pending_form != "" and not pending_form.endswith("’"):
                        pending_form += " "
                    pending_form += cell_text
                column += 1
def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Find the tables nested in the cells of the layout tables of a <div>.

    A nested table is either a HTML <table> element or a wikitext table,
    each one has its own extractor.
    """
    for layout_table in div_node.find_child(NodeKind.TABLE):
        for layout_row in layout_table.find_child(NodeKind.TABLE_ROW):
            for layout_cell in layout_row.find_child(NodeKind.TABLE_CELL):
                for inner in layout_cell.children:
                    if not isinstance(inner, WikiNode):
                        continue
                    if inner.kind == NodeKind.HTML and inner.tag == "table":
                        process_fr_conj_html_table(
                            wxr, entry, inner, h3_text, conj_page_title
                        )
                    elif inner.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, inner, h3_text, conj_page_title
                        )
def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form from each <tr> of a HTML conjugation table.

    The text of the first <tr> is used as a tag of all forms, after
    `h3_text` when that is not empty. In the other rows the first two <td>
    texts make the form and the remaining ones are joined in one IPA value.
    """
    shared_tags = []
    if h3_text != "":
        shared_tags.append(h3_text)

    for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
        if tr_index == 0:
            shared_tags.append(clean_node(wxr, None, tr_node.children))
            continue

        form = Form(raw_tags=shared_tags, source=conj_page_title)
        for td_index, td_node in enumerate(
            tr_node.find_html_recursively("td")
        ):
            text = clean_node(wxr, None, td_node)
            if td_index >= 2:
                if len(form.ipas) > 0:
                    form.ipas[0] += text
                elif text.endswith("‿"):
                    # liaison mark: the next part follows without space
                    form.ipas.append(text)
                else:
                    form.ipas.append(text + " ")
                continue
            form.form += text
            # no space after an elided pronoun like "j’"
            if td_index == 0 and not text.endswith("’"):
                form.form += " "

        translate_raw_tags(form)
        entry.forms.append(form)
def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form from each row of a wikitext conjugation table.

    The text of the first row is used as a tag of all forms, after
    `h3_text` when that is not empty. In the other rows the first two cells
    make the form and the remaining cells are IPA values. Rows that give an
    empty form are dropped.
    """
    shared_tags = []
    if h3_text != "":
        shared_tags.append(h3_text)

    for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
        if row_index == 0:
            shared_tags.append(clean_node(wxr, None, row.children))
            continue

        form = Form(raw_tags=shared_tags, source=conj_page_title)
        for cell_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            text = clean_node(wxr, None, cell)
            missing_pronunciation = text.endswith("Prononciation ?")
            if cell_index >= 2:
                if not missing_pronunciation:
                    form.ipas.append(text)
                continue
            if text == "—" or missing_pronunciation:
                continue
            # a "-" suffix is glued to the previous text, except when that
            # text ends with a closing parenthesis
            if text.startswith("-") and not form.form.strip().endswith(")"):
                form.form = form.form.strip()
            form.form += text
            if cell_index == 0 and len(text) > 0:
                form.form += " "

        if len(form.form) > 0:
            translate_raw_tags(form)
            entry.forms.append(form)
def process_ja_flx_adj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract the forms of a Japanese adjective inflection template.

    https://fr.wiktionary.org/wiki/Modèle:ja-adj
    https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な

    A header cell sets the first tag of the rows that follow it. In a row,
    the lines of cell 0 are tags, and the lines of cells 1, 2 and 3 are the
    form, hiragana and romanization; line N of each cell belongs to the
    N-th form of the row.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table in expanded_root.find_child(NodeKind.TABLE):
        section_tag = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_forms = []
            # built before reading the row: a header cell of this row only
            # affects the next rows
            row_tags = [section_tag]
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    section_tag = text
                    continue
                lines = text.splitlines()
                if cell_index == 0:
                    row_tags.extend(lines)
                    continue
                for line_index, line in enumerate(lines):
                    if line_index >= len(row_forms):
                        new_form = Form(
                            raw_tags=row_tags, source=conj_page_title
                        )
                        row_forms.append(translate_raw_tags(new_form))
                    target = row_forms[line_index]
                    if cell_index == 1:
                        target.form = line
                    elif cell_index == 2:
                        target.hiragana = line
                    elif cell_index == 3:
                        target.roman = line

            entry.forms.extend(row_forms)
def process_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract the forms of a Japanese verb conjugation template.

    https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    Modèle:ja-在る

    A row made of a single header cell sets the first tag of the rows that
    follow. Other header cells are row headers, kept for as many rows as
    their "rowspan" attribute. The data cells 0, 1 and 2 of a row are the
    form, hiragana and romanization. Rows without form are dropped.
    """
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows the header still applies to
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if len(row.children) > 1 and all(
                isinstance(c, WikiNode)
                and c.kind == NodeKind.TABLE_HEADER_CELL
                for c in row.children
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    # get_cell_span() falls back to 1 on a malformed
                    # attribute instead of raising ValueError like int()
                    _, rowspan = get_cell_span(header)
                    row_headers[header_text] = rowspan

            tags = [first_tag]
            # iterate a snapshot: the dict is changed inside the loop
            for tag, rowspan in list(row_headers.items()):
                tags.append(tag)
                if rowspan == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = rowspan - 1

            form = Form(raw_tags=tags, source=conj_page_title)
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_index == 0:
                    form.form = cell_text
                elif cell_index == 1:
                    form.hiragana = cell_text
                elif cell_index == 2:
                    form.roman = cell_text
            if len(form.form) > 0:
                translate_raw_tags(form)
                entry.forms.append(form)
def extract_ku_conj_trans_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a Kurdish conjugation template, extract the forms of its tables
    and clean its links with `entry` as the category data target.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table_node in expanded_root.find_child(NodeKind.TABLE):
        extract_ku_conj_trans_table_node(
            wxr, entry, table_node, conj_page_title
        )
    for link in expanded_root.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link)
def extract_ku_conj_trans_table_node(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Extract the forms of a Kurdish conjugation table.

    Header cells are column headers, located by column index and colspan;
    they are reset when a row with headers follows a row without header.
    Each non-empty data cell becomes a form tagged with the column headers
    covering its column, except "(inusité)" cells and cells equal to the
    page title.
    """
    # title-like headers that must not be used as tags
    ignore_headers = (
        "Conjugaison du verbe",
        "TEMPS DU PRÉSENT ET DU FUTUR",
        "TEMPS DU PRESENT ET DU FUTUR",
        "TEMPS DU PASSÉ",
        "TEMPS DU PASSE",
    )
    # The module-level TableHeader and get_cell_span() are used here rather
    # than a local dataclass rebuilt on every call and a copy of the
    # colspan parsing code.
    col_headers: list[TableHeader] = []
    last_row_has_header = False
    last_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        current_row_has_header = row.contain_node(NodeKind.TABLE_HEADER_CELL)
        if current_row_has_header and not last_row_has_header:
            # a new group of header rows starts a new set of columns
            col_headers.clear()
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_str = clean_node(wxr, None, cell)
            if cell_str == "":
                col_index += 1
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_str.startswith(ignore_headers):
                    last_header = cell_str
                    continue
                colspan, _ = get_cell_span(cell)
                col_headers.append(
                    TableHeader(
                        text=cell_str, col_index=col_index, colspan=colspan
                    )
                )
                last_header = cell_str
                col_index += colspan
            elif last_header == "TEMPS DU PASSÉ":
                # NOTE(review): only the accented spelling is tested here
                # while `ignore_headers` also lists "TEMPS DU PASSE";
                # confirm if both spellings should skip the cells.
                continue
            elif cell_str == "(inusité)":
                col_index += 1
            elif cell_str != wxr.wtp.title:
                form = Form(form=cell_str, source=conj_page_title)
                for header in col_headers:
                    if (
                        header.col_index
                        <= col_index
                        < header.col_index + header.colspan
                    ):
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                entry.forms.append(form)
                col_index += 1
        last_row_has_header = current_row_has_header
522def extract_ko_conj_template(
523 wxr: WiktextractContext,
524 entry: WordEntry,
525 t_node: TemplateNode,
526 conj_page_title: str,
527) -> None:
528 word_page_title = wxr.wtp.title
529 wxr.wtp.title = conj_page_title
530 expanded_node = wxr.wtp.parse(
531 wxr.wtp.node_to_wikitext(t_node), expand_all=True
532 )
533 for h3 in expanded_node.find_html("h3"):
534 clean_node(wxr, entry, h3)
535 for table_index, table in enumerate(
536 expanded_node.find_child(NodeKind.TABLE)
537 ):
538 if table_index == 0:
539 continue
540 shared_raw_tags = []
541 for caption_node in table.find_child(NodeKind.TABLE_CAPTION):
542 caption = clean_node(wxr, None, caption_node.children)
543 if caption != "": 543 ↛ 541line 543 didn't jump to line 541 because the condition on line 543 was always true
544 shared_raw_tags.append(caption)
545 col_headers = []
546 row_headers = []
547 row_index = 0
548 row_header_indexes = [0]
549 for row in table.find_child(NodeKind.TABLE_ROW):
550 col_index = 0
551 for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
552 cell_str = clean_node(wxr, None, header_cell)
553 if cell_str == "":
554 continue
555 colspan, rowspan = get_cell_span(header_cell)
556 if row.contain_node(NodeKind.TABLE_CELL):
557 header_added = False
558 current_row_index = row_index
559 for index, row_header_index in enumerate( 559 ↛ 567line 559 didn't jump to line 567 because the loop on line 559 didn't complete
560 row_header_indexes
561 ):
562 if row_index >= row_header_index:
563 current_row_index = row_header_indexes[index]
564 row_header_indexes[index] += rowspan
565 header_added = True
566 break
567 if not header_added: 567 ↛ 568line 567 didn't jump to line 568 because the condition on line 567 was never true
568 row_header_indexes.append(rowspan)
569 row_headers.append(
570 TableHeader(
571 text=cell_str,
572 row_index=current_row_index,
573 rowspan=rowspan,
574 )
575 )
576 else:
577 col_headers.append(
578 TableHeader(
579 text=cell_str,
580 col_index=col_index,
581 colspan=colspan,
582 )
583 )
584 col_index += colspan
585 if row.contain_node(NodeKind.TABLE_CELL):
586 row_index += 1
588 row_index = 0
589 for row in table.find_child(NodeKind.TABLE_ROW):
590 col_index = 0
591 for cell in row.find_child(NodeKind.TABLE_CELL):
592 cell_str = clean_node(wxr, None, cell)
593 colspan, rowspan = get_cell_span(cell)
594 if cell_str == "—": 594 ↛ 595line 594 didn't jump to line 595 because the condition on line 594 was never true
595 col_index += 1
596 else:
597 form = Form(
598 source=conj_page_title, raw_tags=shared_raw_tags
599 )
600 for line_index, line in enumerate(cell_str.splitlines()):
601 match line_index:
602 case 0:
603 form.form = line
604 case 1:
605 form.roman = line
606 case 2: 606 ↛ 600line 606 didn't jump to line 600 because the pattern on line 606 always matched
607 form.ipas.append(line)
608 for header in col_headers:
609 if (
610 col_index >= header.col_index
611 and col_index < header.col_index + header.colspan
612 ):
613 form.raw_tags.append(header.text)
614 for header in row_headers:
615 if (
616 row_index < header.row_index + header.rowspan
617 and row_index + rowspan > header.row_index
618 ):
619 form.raw_tags.append(header.text)
620 if form.form not in ["", wxr.wtp.title]: 620 ↛ 623line 620 didn't jump to line 623 because the condition on line 620 was always true
621 translate_raw_tags(form)
622 entry.forms.append(form)
623 col_index += 1
624 if row.contain_node(NodeKind.TABLE_CELL):
625 row_index += 1
627 for link in expanded_node.find_child(NodeKind.LINK):
628 clean_node(wxr, entry, link)
629 wxr.wtp.title = word_page_title
def get_cell_span(cell: WikiNode) -> tuple[int, int]:
    """
    Return the `(colspan, rowspan)` values of a table cell.

    The values are read from the "colspan" and "rowspan" attributes of
    `cell.attrs`. An attribute that is missing or that is not a plain
    non-negative integer gives 1; whitespace around the number is ignored.
    A colspan of 0 also gives 1, as HTML does, so that callers adding the
    colspan to a column index always move forward. A rowspan of 0 is
    returned unchanged.
    """

    def parse_span(attr_name: str) -> int:
        raw_value = str(cell.attrs.get(attr_name, "1")).strip()
        if re.fullmatch(r"\d+", raw_value) is None:
            return 1
        return int(raw_value)

    return max(parse_span("colspan"), 1), parse_span("rowspan")
def extract_de_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
):
    """
    Extract the forms of the German "de-conj" template.

    The template is expanded with the page title temporarily set to the
    conjugation page title. In the first two tables each data cell is one
    form, with an optional "tag:" prefix. In the other tables the data
    cells come in pairs that are joined with a space to make one form.
    Links of the expansion are cleaned with `word_entry` as category data
    target.
    """
    word_page_title = wxr.wtp.title
    wxr.wtp.title = conj_page_title
    try:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
    finally:
        # restore the title even if the expansion fails
        wxr.wtp.title = word_page_title

    for table_index, table in enumerate(
        expanded_node.find_child(NodeKind.TABLE)
    ):
        table_header = ""
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            word_part = ""
            col_index = 0
            if table_index >= 2 and row.contain_node(
                NodeKind.TABLE_HEADER_CELL
            ):
                # these tables have a header row for each group of rows
                col_headers.clear()
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    # empty cells are not counted in `col_index`
                    continue
                elif cell.kind == NodeKind.TABLE_HEADER_CELL:
                    # a header alone in its row is the title of the table
                    if len(row.children) == 1:
                        table_header = cell_text
                    else:
                        col_headers.append(cell_text)
                elif table_index < 2:
                    form = Form(form=cell_text, source=conj_page_title)
                    if ":" in cell_text:
                        colon_index = cell_text.index(":")
                        raw_tag = cell_text[:colon_index].strip()
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        form.form = cell_text[colon_index + 1 :].strip()
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if col_index < len(col_headers):
                        form.raw_tags.append(col_headers[col_index])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                elif col_index % 2 == 0:
                    # first cell of a pair, joined with the next cell
                    word_part = cell_text
                else:
                    form = Form(
                        form=f"{word_part} {cell_text}", source=conj_page_title
                    )
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    # one column header covers one pair of cells
                    if col_index // 2 < len(col_headers):
                        form.raw_tags.append(col_headers[col_index // 2])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += 1

    for cat_link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, cat_link)
def extract_declension_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
):
    """
    Load a declension page, looked up with the id of the "Appendix"
    namespace, and extract the forms of its top-level templates.

    Nothing happens if the page doesn't exist.
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
    page_text = wxr.wtp.get_page_body(page_title, namespace_id)
    if page_text is None:
        return
    for template in wxr.wtp.parse(page_text).find_child(NodeKind.TEMPLATE):
        extract_declension_template(wxr, word_entry, page_title, template, "")
def extract_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
):
    """
    Extract the forms of a template of a declension page.

    German adjective declension templates are extracted directly. For the
    "Onglets conjugaison" template, the templates found in the "contenuN"
    argument of each tab are extracted recursively, with the tab title
    "ongletN" passed as `tab_name`. Other templates are ignored.
    """
    if t_node.template_name in [
        "de-adjectif-déclinaisons",
        "de-adj-déclinaisons",
    ]:
        extract_de_adj_declension_template(
            wxr, word_entry, page_title, t_node, tab_name
        )
    elif t_node.template_name == "Onglets conjugaison":
        for index in range(1, 7):
            tab_name_arg = f"onglet{index}"
            if tab_name_arg not in t_node.template_parameters:
                break
            tab_content_arg = f"contenu{index}"
            if tab_content_arg not in t_node.template_parameters:
                # a tab with a title but no content: skip it instead of
                # failing with KeyError
                continue
            tab_name = clean_node(
                wxr, None, t_node.template_parameters[tab_name_arg]
            )
            tab_content = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(
                    t_node.template_parameters[tab_content_arg]
                )
            )
            for node in tab_content.find_child(NodeKind.TEMPLATE):
                extract_declension_template(
                    wxr, word_entry, page_title, node, tab_name
                )
def extract_de_adj_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
):
    """
    Extract the forms of a German adjective declension template.

    https://fr.wiktionary.org/wiki/Modèle:de-adjectif-déclinaisons

    Each section of the expanded template holds tables. Header cells of
    rows without data cells are column headers, except "Forme"; a header
    cell of a row with data cells is the row header. A data cell under the
    "Article" column sets the article of the next forms of the row, any
    other data cell is a form. Forms are tagged with their column headers,
    then `tab_name`, the section title, the table caption and the row
    header when not empty.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for level_node in expanded_node.find_child(LEVEL_KIND_FLAGS):
        section_title = clean_node(wxr, None, level_node.largs)
        for table in level_node.find_child(NodeKind.TABLE):
            table_caption = ""
            for cap_node in table.find_child(NodeKind.TABLE_CAPTION):
                table_caption = clean_node(wxr, None, cap_node.children)
            col_headers = []
            for row in table.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                row_has_data = row.contain_node(NodeKind.TABLE_CELL)
                article = ""
                for cell in row.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    # get_cell_span() gives 1 for a malformed attribute
                    # where int() would raise ValueError
                    colspan, _ = get_cell_span(cell)
                    cell_text = clean_node(wxr, None, cell)
                    if cell.kind == NodeKind.TABLE_HEADER_CELL:
                        if row_has_data:
                            row_header = cell_text
                        elif cell_text != "Forme":
                            col_headers.append(
                                TableHeader(
                                    cell_text,
                                    col_index=col_index,
                                    colspan=colspan,
                                )
                            )
                    else:
                        use_col_headers = [
                            col_header.text
                            for col_header in col_headers
                            if col_header.col_index
                            <= col_index
                            < col_header.col_index + col_header.colspan
                        ]
                        if "Article" in use_col_headers:
                            if cell_text != "—":
                                article = cell_text
                        else:
                            form = Form(
                                form=cell_text,
                                article=article,
                                raw_tags=use_col_headers,
                                source=page_title,
                            )
                            for raw_tag in [
                                tab_name,
                                section_title,
                                table_caption,
                                row_header,
                            ]:
                                if raw_tag != "":
                                    form.raw_tags.append(raw_tag)
                            if form.form not in ["", wxr.wtp.title]:
                                translate_raw_tags(form)
                                word_entry.forms.append(form)
                    # header cells are counted too: they take a column
                    col_index += colspan

        for link in level_node.find_child(NodeKind.LINK):
            clean_node(wxr, word_entry, link)