Coverage for src/wiktextract/extractor/fr/conjugation.py: 92%
458 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-17 05:52 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-17 05:52 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 HTMLNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Form, WordEntry
15from .tags import translate_raw_tags
def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """Read a page of the "Conjugaison" namespace and extract its forms.

    Each top-level template of the page is handed to the extractor that
    matches its name. Nothing is done when the page body cannot be found.
    A template whose name points at another "Conjugaison:" page makes this
    function call itself on that page, using the template's "sél" argument
    (default "2") as the tab selector.

    ``select_tab`` is only forwarded to the "Onglets conjugaison" handler.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons
    """
    page_body = wxr.wtp.get_page_body(
        conj_page_title, wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    )
    if page_body is None:
        return
    root = wxr.wtp.parse(page_body)
    for t_node in root.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name.endswith("-intro"):
            continue
        if name in ("ku-conj-trans", "ku-conj"):
            extract_ku_conj_trans_template(wxr, entry, t_node, conj_page_title)
        elif name == "ko-conj":
            extract_ko_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "de-conj":
            extract_de_conj_template(wxr, entry, t_node, conj_page_title)
        elif "-conj" in name or name.startswith("it-"):
            # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_italien
            # names starting with "it-" are Italian table templates
            process_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "Onglets conjugaison":
            process_onglets_conjugaison_template(
                wxr, entry, t_node, conj_page_title, select_tab
            )
        elif name.removeprefix(":").startswith("Conjugaison:"):
            # transclusion of another conjugation page
            linked_page_title = name.removeprefix(":")
            linked_tab = clean_node(
                wxr, None, t_node.template_parameters.get("sél", "2")
            )
            extract_conjugation(wxr, entry, linked_page_title, linked_tab)
        elif name.startswith("ja-flx-adj"):
            process_ja_flx_adj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("ja-"):
            process_ja_conj_template(wxr, entry, t_node, conj_page_title)

    if conj_page_title.startswith("Conjugaison:kurde/"):
        # Kurdish pages may carry bare wiki tables next to the templates
        for table in root.find_child(NodeKind.TABLE):
            extract_ku_conj_trans_table_node(wxr, entry, table, conj_page_title)

    # NOTE(review): the result is discarded and no entry is passed here,
    # whereas the sibling extractors pass the entry to clean_node for link
    # nodes - confirm whether that is intended.
    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, None, link_node)
def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """Extract the conjugation tables held in an "Onglets conjugaison" tab set.

    https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
    The template expands to tabs of tables; the tab contents are read from
    its "contenuN" arguments.

    Only the tab named by ``select_tab`` is read when ``select_tab`` is not
    "1", or when it is "1" and the "onglet1" argument is
    "Conjugaison active". Otherwise tabs 1 to 6 are read in order, stopping
    at the first missing "contenuN" argument.

    A tab content may be a single template node or a list of nodes. In both
    cases a template is processed when its name contains "-conj" or starts
    with "it-" (previously the single-node case ignored "it-" templates,
    unlike the list case).
    """
    # `or` short-circuits, so "onglet1" is only cleaned when select_tab is "1"
    if select_tab != "1" or (
        clean_node(wxr, None, node.template_parameters.get("onglet1", ""))
        == "Conjugaison active"
    ):
        # don't extract or only extract "Conjugaison pronominale" tab
        selected_tabs = [select_tab]
    else:
        selected_tabs = [str(i) for i in range(1, 7)]

    for tab_index in selected_tabs:
        arg_name = f"contenu{tab_index}"
        if arg_name not in node.template_parameters:
            break
        arg_value = node.template_parameters[arg_name]
        if isinstance(arg_value, TemplateNode):
            candidates = [arg_value]
        elif isinstance(arg_value, list):
            candidates = arg_value
        else:
            candidates = []
        for candidate in candidates:
            if isinstance(candidate, TemplateNode) and (
                "-conj" in candidate.template_name
                or candidate.template_name.startswith("it-")
            ):
                process_conj_template(wxr, entry, candidate, conj_page_title)
def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Expand a conjugation table template and extract the expanded tree.

    https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)
def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """Walk an expanded conjugation template and extract its table blocks.

    The current heading starts as the title of ``node`` when ``node`` is a
    level-3 section (empty otherwise) and is replaced by each ``<h3>``
    element met among the children. Nested section nodes are processed
    recursively. Each ``<div>`` child is extracted with the heading that is
    current at that point; the "Modes impersonnels" heading selects the
    dedicated table parser.
    """
    heading = ""
    if node.kind == NodeKind.LEVEL3:
        heading = clean_node(wxr, None, node.largs)

    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue
        if child.tag == "h3":
            heading = clean_node(wxr, None, child)
        elif child.tag == "div":
            if heading == "Modes impersonnels":
                process_fr_conj_modes_table(wxr, entry, child, conj_page_title)
            else:
                process_fr_conj_table(
                    wxr, entry, child, heading, conj_page_title
                )
@dataclass
class TableHeader:
    """Text of a table header cell together with the cells it spans."""

    # cleaned text of the header cell
    text: str
    # index of the first column covered by the header
    col_index: int = 0
    # number of columns covered by the header
    colspan: int = 0
    # index of the first row covered by the header
    row_index: int = 0
    # number of rows covered by the header
    rowspan: int = 0
def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """Extract forms from the first "Modes impersonnels" table.

    Rows without any data cell supply the column headers (the "Mode" header
    is skipped). In other rows a header cell, or a bold cell met while the
    column index is still 0, supplies the row header. The texts of data
    cells are joined into one form until a cell ending with "]", "\\" or
    "Prononciation ?" is met; that cell closes the form and, unless it is
    the "Prononciation ?" placeholder, becomes its IPA.
    """
    ipa_endings = ("]", "\\")
    for table in div_node.find_child(NodeKind.TABLE):
        column_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            header_only_row = not row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            pending_form = ""
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                acts_as_header = cell.kind == NodeKind.TABLE_HEADER_CELL or (
                    cell.contain_node(NodeKind.BOLD) and col_index == 0
                )
                if acts_as_header:
                    if not header_only_row:
                        # row headers are not counted as a column
                        row_header = clean_node(wxr, None, cell)
                        continue
                    header_text = clean_node(wxr, None, cell)
                    if header_text == "Mode":
                        continue
                    colspan_str = cell.attrs.get("colspan", "1")
                    colspan = (
                        int(colspan_str)
                        if re.fullmatch(r"\d+", colspan_str) is not None
                        else 1
                    )
                    column_headers.append(
                        TableHeader(header_text, col_index, colspan)
                    )
                    col_index += colspan
                    continue

                cell_text = clean_node(wxr, None, cell)
                closes_form = cell_text.endswith(
                    ("]", "\\", "Prononciation ?")
                )
                if closes_form and pending_form != "":
                    ipas = [cell_text] if cell_text.endswith(ipa_endings) else []
                    form = Form(
                        form=pending_form, ipas=ipas, source=conj_page_title
                    )
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    form.raw_tags.extend(
                        header.text
                        for header in column_headers
                        if header.col_index
                        <= col_index
                        < header.col_index + header.colspan
                    )
                    translate_raw_tags(form)
                    entry.forms.append(form)
                    pending_form = ""
                elif cell_text != "":
                    # no space is inserted after an elision apostrophe
                    if pending_form != "" and not pending_form.endswith("’"):
                        pending_form += " "
                    pending_form += cell_text
                col_index += 1
def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """Extract the tables nested in the cells of a conjugation layout table.

    Every data cell of every table directly under ``div_node`` is scanned;
    a nested HTML ``<table>`` element or a nested wiki table found among the
    cell's children is passed to the matching extractor together with
    ``h3_text``.
    """
    for layout_table in div_node.find_child(NodeKind.TABLE):
        for layout_row in layout_table.find_child(NodeKind.TABLE_ROW):
            for layout_cell in layout_row.find_child(NodeKind.TABLE_CELL):
                for nested in layout_cell.children:
                    if not isinstance(nested, WikiNode):
                        continue
                    if nested.kind == NodeKind.HTML and nested.tag == "table":
                        process_fr_conj_html_table(
                            wxr, entry, nested, h3_text, conj_page_title
                        )
                    elif nested.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, nested, h3_text, conj_page_title
                        )
def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
):
    """Extract one form per row from an HTML ``<table>`` conjugation table.

    The text of the first ``<tr>`` is added to the tags shared by all forms
    (after ``h3_text`` when that is not empty). In each later row the first
    two ``<td>`` texts are concatenated into the form and the remaining
    ones into a single IPA string.
    """
    tags = [] if h3_text == "" else [h3_text]
    for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
        if tr_index == 0:
            tags.append(clean_node(wxr, None, tr_node.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for td_index, td_node in enumerate(
            tr_node.find_html_recursively("td")
        ):
            td_text = clean_node(wxr, None, td_node)
            if td_index < 2:
                form.form += td_text
                # no space after a first cell ending with an apostrophe
                if td_index == 0 and not td_text.endswith("’"):
                    form.form += " "
            elif len(form.ipas) > 0:
                form.ipas[0] += td_text
            else:
                # no space after a first IPA part ending with a liaison tie
                if not td_text.endswith("‿"):
                    td_text += " "
                form.ipas.append(td_text)

        translate_raw_tags(form)
        entry.forms.append(form)
def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
):
    """Extract one form per row from a wiki-syntax conjugation table.

    The text of the first row is added to the tags shared by all forms
    (after ``h3_text`` when that is not empty). In each later row the first
    two data cells build the form and the following cells are IPA strings.
    "—" cells and "Prononciation ?" placeholders are ignored, and a row
    whose form stays empty is dropped.
    """
    tags = [] if h3_text == "" else [h3_text]
    for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
        if row_index == 0:
            tags.append(clean_node(wxr, None, row.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for cell_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            cell_text = clean_node(wxr, None, cell)
            is_placeholder = cell_text.endswith("Prononciation ?")
            if cell_index >= 2:
                if not is_placeholder:
                    form.ipas.append(cell_text)
                continue
            if cell_text == "—" or is_placeholder:
                continue
            # a part starting with "-" is attached directly to the previous
            # part, unless that part ends with ")"
            if cell_text.startswith("-") and not form.form.strip().endswith(
                ")"
            ):
                form.form = form.form.strip()
            form.form += cell_text
            if cell_index == 0 and len(cell_text) > 0:
                form.form += " "

        if len(form.form) > 0:
            translate_raw_tags(form)
            entry.forms.append(form)
def process_ja_flx_adj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Extract forms from an expanded Japanese adjective inflection table.

    https://fr.wiktionary.org/wiki/Modèle:ja-adj
    https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な

    A header cell sets the tag that starts the tag list of the following
    rows. In a row, the lines of the cell at index 0 are added as tags, and
    the lines of the cells at index 1, 2 and 3 fill the ``form``,
    ``hiragana`` and ``roman`` fields of one form per line.
    """
    field_by_cell_index = {1: "form", 2: "hiragana", 3: "roman"}
    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table in expanded_root.find_child(NodeKind.TABLE):
        first_tag = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_forms = []
            row_tags = [first_tag]
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    first_tag = cell_text
                    continue
                for line_index, line in enumerate(cell_text.splitlines()):
                    if cell_index == 0:
                        row_tags.append(line)
                        continue
                    if line_index >= len(row_forms):
                        # the form is created with the tags known so far
                        row_forms.append(
                            translate_raw_tags(
                                Form(raw_tags=row_tags, source=conj_page_title)
                            )
                        )
                    field_name = field_by_cell_index.get(cell_index)
                    if field_name is not None:
                        setattr(row_forms[line_index], field_name, line)

            entry.forms.extend(row_forms)
def process_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Extract forms from an expanded Japanese verb conjugation table.

    https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    Modèle:ja-在る

    A row holding a single header cell sets the first tag of the following
    rows. Other header cells are row headers that stay active for as many
    rows as their "rowspan" attribute says. The lines of the data cells at
    index 0, 1 and 2 fill the ``form``, ``hiragana`` and ``roman`` fields
    of one form per line; forms with an empty ``form`` are dropped.

    A "rowspan" attribute that is not a valid integer is treated as 1
    instead of raising ``ValueError``.
    """
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows (current one included) it still covers
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if (
                all(
                    isinstance(c, WikiNode)
                    and c.kind == NodeKind.TABLE_HEADER_CELL
                    for c in row.children
                )
                and len(row.children) > 1
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    try:
                        rowspan = int(header.attrs.get("rowspan", "1"))
                    except ValueError:
                        # malformed attribute: behave as if it were absent
                        rowspan = 1
                    row_headers[header_text] = rowspan

            tags = [first_tag]
            # iterate over a copy because expired headers are deleted
            for tag, rowspan in row_headers.copy().items():
                tags.append(tag)
                if rowspan == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = rowspan - 1

            forms = []
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                for line_index, line in enumerate(cell_text.splitlines()):
                    if cell_index == 0:
                        forms.append(
                            Form(
                                form=line.strip(),
                                raw_tags=tags,
                                source=conj_page_title,
                            )
                        )
                    elif cell_index == 1 and line_index < len(forms):
                        forms[line_index].hiragana = line.strip()
                    elif cell_index == 2 and line_index < len(forms):
                        forms[line_index].roman = line.strip()
            for form in forms:
                if len(form.form) > 0:
                    translate_raw_tags(form)
                    entry.forms.append(form)
def extract_ku_conj_trans_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Expand a Kurdish conjugation template and extract its tables.

    Each table of the expanded template is passed to
    ``extract_ku_conj_trans_table_node``; the top-level link nodes are then
    cleaned with ``entry`` as the target data.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table_node in expanded_root.find_child(NodeKind.TABLE):
        extract_ku_conj_trans_table_node(
            wxr, entry, table_node, conj_page_title
        )
    for link_node in expanded_root.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link_node)
def extract_ku_conj_trans_table_node(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    conj_page_title: str,
) -> None:
    """Extract forms from one Kurdish conjugation table.

    Header cells (except the section titles listed in ``skipped_titles``)
    are collected as column headers with their column range. The collected
    headers are reset when a row with header cells follows a row without
    any. Each non-empty data cell becomes a form tagged with the headers
    covering its column, except "(inusité)" cells, cells equal to the
    current page title, and data cells met while the last header seen is
    "TEMPS DU PASSÉ".
    """

    @dataclass
    class TableHeader:
        text: str
        index: int
        span: int

    skipped_titles = (
        "Conjugaison du verbe",
        "TEMPS DU PRÉSENT ET DU FUTUR",
        "TEMPS DU PRESENT ET DU FUTUR",
        "TEMPS DU PASSÉ",
        "TEMPS DU PASSE",
    )
    column_headers = []
    previous_row_had_header = False
    latest_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        row_has_header = row.contain_node(NodeKind.TABLE_HEADER_CELL)
        if row_has_header and not previous_row_had_header:
            # a new block of header rows starts
            column_headers.clear()
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell_text == "":
                col_index += 1
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                latest_header = cell_text
                if cell_text.startswith(skipped_titles):
                    continue
                colspan_str = cell.attrs.get("colspan", "1")
                colspan = (
                    int(colspan_str)
                    if re.fullmatch(r"\d+", colspan_str) is not None
                    else 1
                )
                column_headers.append(
                    TableHeader(text=cell_text, index=col_index, span=colspan)
                )
                col_index += colspan
            elif latest_header == "TEMPS DU PASSÉ":
                # NOTE(review): only the accented spelling is tested here
                # although the unaccented one is also listed above - confirm.
                continue
            elif cell_text == "(inusité)":
                col_index += 1
            elif cell_text != wxr.wtp.title:
                form = Form(form=cell_text, source=conj_page_title)
                form.raw_tags.extend(
                    header.text
                    for header in column_headers
                    if header.index <= col_index < header.index + header.span
                )
                translate_raw_tags(form)
                entry.forms.append(form)
                col_index += 1
        previous_row_had_header = row_has_header
535def extract_ko_conj_template(
536 wxr: WiktextractContext,
537 entry: WordEntry,
538 t_node: TemplateNode,
539 conj_page_title: str,
540) -> None:
541 word_page_title = wxr.wtp.title
542 wxr.wtp.title = conj_page_title
543 expanded_node = wxr.wtp.parse(
544 wxr.wtp.node_to_wikitext(t_node), expand_all=True
545 )
546 for h3 in expanded_node.find_html("h3"):
547 clean_node(wxr, entry, h3)
548 for table_index, table in enumerate(
549 expanded_node.find_child(NodeKind.TABLE)
550 ):
551 if table_index == 0:
552 continue
553 shared_raw_tags = []
554 for caption_node in table.find_child(NodeKind.TABLE_CAPTION):
555 caption = clean_node(wxr, None, caption_node.children)
556 if caption != "": 556 ↛ 554line 556 didn't jump to line 554 because the condition on line 556 was always true
557 shared_raw_tags.append(caption)
558 col_headers = []
559 row_headers = []
560 row_index = 0
561 row_header_indexes = [0]
562 for row in table.find_child(NodeKind.TABLE_ROW):
563 col_index = 0
564 for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
565 cell_str = clean_node(wxr, None, header_cell)
566 if cell_str == "":
567 continue
568 colspan, rowspan = get_cell_span(header_cell)
569 if row.contain_node(NodeKind.TABLE_CELL):
570 header_added = False
571 current_row_index = row_index
572 for index, row_header_index in enumerate( 572 ↛ 580line 572 didn't jump to line 580 because the loop on line 572 didn't complete
573 row_header_indexes
574 ):
575 if row_index >= row_header_index:
576 current_row_index = row_header_indexes[index]
577 row_header_indexes[index] += rowspan
578 header_added = True
579 break
580 if not header_added: 580 ↛ 581line 580 didn't jump to line 581 because the condition on line 580 was never true
581 row_header_indexes.append(rowspan)
582 row_headers.append(
583 TableHeader(
584 text=cell_str,
585 row_index=current_row_index,
586 rowspan=rowspan,
587 )
588 )
589 else:
590 col_headers.append(
591 TableHeader(
592 text=cell_str,
593 col_index=col_index,
594 colspan=colspan,
595 )
596 )
597 col_index += colspan
598 if row.contain_node(NodeKind.TABLE_CELL):
599 row_index += 1
601 row_index = 0
602 for row in table.find_child(NodeKind.TABLE_ROW):
603 col_index = 0
604 for cell in row.find_child(NodeKind.TABLE_CELL):
605 cell_str = clean_node(wxr, None, cell)
606 colspan, rowspan = get_cell_span(cell)
607 if cell_str == "—": 607 ↛ 608line 607 didn't jump to line 608 because the condition on line 607 was never true
608 col_index += 1
609 else:
610 form = Form(
611 source=conj_page_title, raw_tags=shared_raw_tags
612 )
613 for line_index, line in enumerate(cell_str.splitlines()):
614 match line_index:
615 case 0:
616 form.form = line
617 case 1:
618 form.roman = line
619 case 2: 619 ↛ 613line 619 didn't jump to line 613 because the pattern on line 619 always matched
620 form.ipas.append(line)
621 for header in col_headers:
622 if (
623 col_index >= header.col_index
624 and col_index < header.col_index + header.colspan
625 ):
626 form.raw_tags.append(header.text)
627 for header in row_headers:
628 if (
629 row_index < header.row_index + header.rowspan
630 and row_index + rowspan > header.row_index
631 ):
632 form.raw_tags.append(header.text)
633 if form.form not in ["", wxr.wtp.title]: 633 ↛ 636line 633 didn't jump to line 636 because the condition on line 633 was always true
634 translate_raw_tags(form)
635 entry.forms.append(form)
636 col_index += 1
637 if row.contain_node(NodeKind.TABLE_CELL):
638 row_index += 1
640 for link in expanded_node.find_child(NodeKind.LINK):
641 clean_node(wxr, entry, link)
642 wxr.wtp.title = word_page_title
def get_cell_span(cell: WikiNode) -> tuple[int, int]:
    """Return the ``(colspan, rowspan)`` of a table cell.

    Each value is read from ``cell.attrs``; a missing attribute, or one
    that is not made of digits only, counts as 1.
    """
    spans = []
    for attr_name in ("colspan", "rowspan"):
        raw_value = cell.attrs.get(attr_name, "1")
        is_number = re.fullmatch(r"\d+", raw_value) is not None
        spans.append(int(raw_value) if is_number else 1)
    return spans[0], spans[1]
def extract_de_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
):
    """Extract forms from an expanded "de-conj" template.

    The template is expanded with ``wxr.wtp.title`` temporarily set to
    ``conj_page_title``. In the first two tables every non-empty data cell
    is one form, and text before a ":" in the cell becomes a tag. In later
    tables the non-empty data cells are read in pairs and joined with a
    space into one form. Forms get the table header (a header cell that is
    alone in its row) and the column header of their position as tags.
    Forms that are empty or equal to the page title are dropped.
    """
    saved_title = wxr.wtp.title
    wxr.wtp.title = conj_page_title
    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    wxr.wtp.title = saved_title

    def add_form(form: Form) -> None:
        # skip empty forms and the lemma itself
        if form.form not in ["", wxr.wtp.title]:
            translate_raw_tags(form)
            word_entry.forms.append(form)

    for table_index, table in enumerate(
        expanded_root.find_child(NodeKind.TABLE)
    ):
        table_header = ""
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            word_part = ""
            col_index = 0
            if table_index >= 2 and row.contain_node(
                NodeKind.TABLE_HEADER_CELL
            ):
                col_headers.clear()
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    # empty cells do not advance the column index
                    continue
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if len(row.children) == 1:
                        table_header = clean_node(wxr, None, cell)
                    else:
                        col_headers.append(clean_node(wxr, None, cell))
                elif table_index < 2:
                    form = Form(form=cell_text, source=conj_page_title)
                    if ":" in cell_text:
                        colon_index = cell_text.index(":")
                        raw_tag = cell_text[:colon_index].strip()
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        form.form = cell_text[colon_index + 1 :].strip()
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if col_index < len(col_headers):
                        form.raw_tags.append(col_headers[col_index])
                    add_form(form)
                elif col_index % 2 == 0:
                    # first half of a pair, kept for the next cell
                    word_part = cell_text
                else:
                    form = Form(
                        form=f"{word_part} {cell_text}", source=conj_page_title
                    )
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    header_index = col_index // 2
                    if header_index < len(col_headers):
                        form.raw_tags.append(col_headers[header_index])
                    add_form(form)
                col_index += 1

    for cat_link in expanded_root.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, cat_link)
def extract_declension_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
):
    """Read a declension page and extract every top-level template of it.

    The page body is looked up with the namespace id stored under
    "Appendix" in ``wxr.wtp.NAMESPACE_DATA``; nothing is done when the page
    cannot be found. Templates are extracted with an empty tab name.
    """
    appendix_ns_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
    body = wxr.wtp.get_page_body(page_title, appendix_ns_id)
    if body is None:
        return
    for template in wxr.wtp.parse(body).find_child(NodeKind.TEMPLATE):
        extract_declension_template(wxr, word_entry, page_title, template, "")
def extract_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
):
    """Dispatch one template of a declension page.

    German adjective declension templates are extracted directly. For an
    "Onglets conjugaison" template, tabs 1 to 6 are visited until the first
    missing "ongletN" argument; each "contenuN" argument is parsed and its
    templates are processed recursively with the tab's name as
    ``tab_name``.

    A tab that has an "ongletN" argument but no "contenuN" argument is
    skipped instead of raising ``KeyError``.
    """
    if t_node.template_name in (
        "de-adjectif-déclinaisons",
        "de-adj-déclinaisons",
    ):
        extract_de_adj_declension_template(
            wxr, word_entry, page_title, t_node, tab_name
        )
    elif t_node.template_name == "Onglets conjugaison":
        for index in range(1, 7):
            tab_name_arg = f"onglet{index}"
            if tab_name_arg not in t_node.template_parameters:
                break
            tab_name = clean_node(
                wxr, None, t_node.template_parameters[tab_name_arg]
            )
            tab_content_arg = f"contenu{index}"
            if tab_content_arg not in t_node.template_parameters:
                # named tab without content: nothing to extract
                continue
            tab_content = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(
                    t_node.template_parameters[tab_content_arg]
                )
            )
            for node in tab_content.find_child(NodeKind.TEMPLATE):
                extract_declension_template(
                    wxr, word_entry, page_title, node, tab_name
                )
def extract_de_adj_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
):
    """Extract forms from an expanded German adjective declension template.

    https://fr.wiktionary.org/wiki/Modèle:de-adjectif-déclinaisons

    Each table of each section of the expansion is read. Header cells of
    rows without data cells are column headers (the "Forme" header is not
    stored); a header cell in a data row is the row header. A data cell
    under the "Article" column sets the article of the following forms of
    the row (unless it is "—"); any other data cell becomes a form tagged
    with its column headers, ``tab_name``, the section title, the table
    caption and the row header. Forms that are empty or equal to the page
    title are dropped.

    A "colspan" attribute that is not a valid integer is treated as 1
    instead of raising ``ValueError``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for level_node in expanded_node.find_child(LEVEL_KIND_FLAGS):
        section_title = clean_node(wxr, None, level_node.largs)
        for table in level_node.find_child(NodeKind.TABLE):
            table_caption = ""
            for cap_node in table.find_child(NodeKind.TABLE_CAPTION):
                table_caption = clean_node(wxr, None, cap_node.children)
            col_headers = []
            for row in table.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                row_has_data = row.contain_node(NodeKind.TABLE_CELL)
                article = ""
                for cell in row.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    try:
                        colspan = int(cell.attrs.get("colspan", "1"))
                    except ValueError:
                        # malformed attribute: behave as if it were absent
                        colspan = 1
                    cell_text = clean_node(wxr, None, cell)
                    if cell.kind == NodeKind.TABLE_HEADER_CELL:
                        if row_has_data:
                            row_header = cell_text
                        elif cell_text != "Forme":
                            col_headers.append(
                                TableHeader(
                                    cell_text,
                                    col_index=col_index,
                                    colspan=colspan,
                                )
                            )
                    else:
                        use_col_headers = [
                            col_header.text
                            for col_header in col_headers
                            if col_header.col_index
                            <= col_index
                            < col_header.col_index + col_header.colspan
                        ]
                        if "Article" in use_col_headers:
                            if cell_text != "—":
                                article = cell_text
                        else:
                            form = Form(
                                form=cell_text,
                                article=article,
                                raw_tags=use_col_headers,
                                source=page_title,
                            )
                            for raw_tag in (
                                tab_name,
                                section_title,
                                table_caption,
                                row_header,
                            ):
                                if raw_tag != "":
                                    form.raw_tags.append(raw_tag)
                            if form.form not in ("", wxr.wtp.title):
                                translate_raw_tags(form)
                                word_entry.forms.append(form)
                    # every cell, header or data, advances the column index
                    col_index += colspan

        for link in level_node.find_child(NodeKind.LINK):
            clean_node(wxr, word_entry, link)