Coverage for src / wiktextract / extractor / fr / conjugation.py: 92%
550 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-21 08:01 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-21 08:01 +0000
1import re
2from dataclasses import dataclass
3from itertools import chain
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 HTMLNode,
8 NodeKind,
9 TemplateNode,
10 WikiNode,
11)
13from ...page import clean_node
14from ...wxr_context import WiktextractContext
15from .models import Form, WordEntry
16from .tags import translate_raw_tags
def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """
    Read a page of the "Conjugaison" namespace and add its forms to `entry`.

    Every top-level template of the page is routed, by template name, to the
    extractor written for that template family. A template that transcludes
    another "Conjugaison:" page is followed recursively and its "sél"
    argument (default "2") is passed on as `select_tab`.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons
    """
    page_text = wxr.wtp.get_page_body(
        conj_page_title, wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    )
    if page_text is None:
        return
    conj_root = wxr.wtp.parse(page_text)
    for t_node in conj_root.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name.endswith("-intro"):
            continue

        # NOTE: the order of the tests matters, the generic "-conj" test
        # must come after the language specific ones.
        if name in ("ku-conj-trans", "ku-conj"):
            extract_ku_conj_trans_template(wxr, entry, t_node, conj_page_title)
        elif name == "ko-conj":
            extract_ko_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "de-conj":
            extract_de_conj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("pt-conj/"):
            extract_pt_conj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("cs-conj-"):
            extract_cs_conj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith(("ro-verb-", "se-conj-")):
            # imported here, as in the original code, not at module level
            from .inflection import extract_inf_table_template

            extract_inf_table_template(wxr, entry, t_node, conj_page_title)
        elif (
            "-conj" in name
            # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_italien
            # Italian table templates
            or name.startswith("it-")
        ):
            process_conj_template(wxr, entry, t_node, conj_page_title)
        elif name == "Onglets conjugaison":
            process_onglets_conjugaison_template(
                wxr, entry, t_node, conj_page_title, select_tab
            )
        elif name.removeprefix(":").startswith("Conjugaison:"):
            # transclusion of another conjugation page
            selected = clean_node(
                wxr, None, t_node.template_parameters.get("sél", "2")
            )
            extract_conjugation(wxr, entry, name.removeprefix(":"), selected)
        elif name.startswith("ja-flx-adj"):
            process_ja_flx_adj_template(wxr, entry, t_node, conj_page_title)
        elif name.startswith("ja-"):
            process_ja_conj_template(wxr, entry, t_node, conj_page_title)

    if conj_page_title.startswith("Conjugaison:kurde/"):
        # tables placed directly on the page, outside of any template
        for table in conj_root.find_child(NodeKind.TABLE):
            extract_ku_conj_trans_table_node(wxr, entry, table, conj_page_title)

    for link_node in conj_root.find_child(NodeKind.LINK):
        clean_node(wxr, None, link_node)
def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """
    Extract the conjugation templates placed in the tabs of the
    "Onglets conjugaison" template.

    https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison

    Tab selection:
    - `select_tab` other than "1": only that tab is read;
    - `select_tab` "1" and the first tab is titled "Conjugaison active":
      only the first tab is read;
    - otherwise tabs "1" to "6" are read, stopping at the first missing
      "contenuN" argument.

    The content of a tab may be a single node or a list of nodes; in both
    cases a template is processed when its name contains "-conj" or starts
    with "it-".
    """
    params = node.template_parameters
    if select_tab != "1" or (
        clean_node(wxr, None, params.get("onglet1", ""))
        == "Conjugaison active"
    ):
        # don't extract or only extract "Conjugaison pronominale" tab
        selected_tabs = [select_tab]
    else:
        selected_tabs = [str(i) for i in range(1, 7)]

    for tab_index in selected_tabs:
        arg_name = f"contenu{tab_index}"
        if arg_name not in params:
            break
        arg_value = params[arg_name]
        # Handle a lone node exactly like a one-item list, so the same
        # template filter applies to both shapes of argument value.
        arg_nodes = arg_value if isinstance(arg_value, list) else [arg_value]
        for arg_node in arg_nodes:
            if isinstance(arg_node, TemplateNode) and (
                "-conj" in arg_node.template_name
                or arg_node.template_name.startswith("it-")
            ):
                process_conj_template(wxr, entry, arg_node, conj_page_title)
def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a conjugation table template and extract forms from the result.

    https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)
def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Walk an expanded conjugation template and hand each `<div>` to the
    matching table extractor.

    The text of the most recent heading (a level 3 section title or an
    `<h3>` tag) is remembered; the div below "Modes impersonnels" goes to
    `process_fr_conj_modes_table`, every other div goes to
    `process_fr_conj_table` together with that heading text. Nested section
    nodes are processed recursively.
    """
    h3_text = ""
    if node.kind == NodeKind.LEVEL3:
        h3_text = clean_node(wxr, None, node.largs)

    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue

        if child.tag == "h3":
            h3_text = clean_node(wxr, None, child)
        elif child.tag == "div":
            if h3_text == "Modes impersonnels":
                process_fr_conj_modes_table(wxr, entry, child, conj_page_title)
            else:
                process_fr_conj_table(
                    wxr, entry, child, h3_text, conj_page_title
                )
@dataclass
class TableHeader:
    """Text of a table header cell and the position it was found at."""

    # cleaned text of the header cell
    text: str
    # column the header starts at, and number of columns it covers
    col_index: int = 0
    colspan: int = 0
    # row the header starts at, and number of rows it covers
    row_index: int = 0
    rowspan: int = 0
def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the first table, "Modes impersonnels".

    Rows without any data cell are header rows: their cells, except "Mode",
    become column headers with their colspan. In other rows a header cell,
    or a bold cell met while the column counter is still 0, is the row
    header. The texts of the data cells are joined into one form until a
    cell ending with "]", "\\" or "Prononciation ?" is met; that cell closes
    the form and, unless it is the "Prononciation ?" placeholder, is stored
    as its IPA.
    """
    for table_node in div_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            pending_form = ""
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL or (
                    cell.contain_node(NodeKind.BOLD) and col_index == 0
                ):
                    if row_has_data:
                        row_header = clean_node(wxr, None, cell)
                        continue
                    header_text = clean_node(wxr, None, cell)
                    if header_text == "Mode":
                        continue
                    colspan_str = cell.attrs.get("colspan", "1")
                    if re.fullmatch(r"\d+", colspan_str) is None:
                        colspan = 1
                    else:
                        colspan = int(colspan_str)
                    col_headers.append(
                        TableHeader(header_text, col_index, colspan)
                    )
                    col_index += colspan
                    continue

                cell_text = clean_node(wxr, None, cell)
                if pending_form != "" and cell_text.endswith(
                    ("]", "\\", "Prononciation ?")
                ):
                    # pronunciation cell: it ends the form collected so far
                    if cell_text.endswith(("]", "\\")):
                        ipas = [cell_text]
                    else:
                        ipas = []
                    form = Form(
                        form=pending_form, ipas=ipas, source=conj_page_title
                    )
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for col_header in col_headers:
                        first_col = col_header.col_index
                        if first_col <= col_index < first_col + col_header.colspan:
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    entry.forms.append(form)
                    pending_form = ""
                elif cell_text != "":
                    # no space is inserted after an apostrophe
                    if pending_form != "" and not pending_form.endswith("’"):
                        pending_form += " "
                    pending_form += cell_text
                col_index += 1
def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Find the tables nested in the cells of the layout tables of `div_node`.

    A nested HTML `<table>` is handled by `process_fr_conj_html_table`, a
    nested wikitext table by `process_fr_conj_wiki_table`.
    """
    for outer_table in div_node.find_child(NodeKind.TABLE):
        for outer_row in outer_table.find_child(NodeKind.TABLE_ROW):
            for outer_cell in outer_row.find_child(NodeKind.TABLE_CELL):
                for inner_node in outer_cell.children:
                    if not isinstance(inner_node, WikiNode):
                        continue
                    if (
                        inner_node.kind == NodeKind.HTML
                        and inner_node.tag == "table"
                    ):
                        process_fr_conj_html_table(
                            wxr, entry, inner_node, h3_text, conj_page_title
                        )
                    elif inner_node.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, inner_node, h3_text, conj_page_title
                        )
def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per `<tr>` of an HTML conjugation table.

    The text of the first `<tr>` is used as a tag, after `h3_text` when that
    is not empty. In the other rows the first two `<td>` make up the form and
    the remaining `<td>` are concatenated into a single IPA string.
    """
    tags = [] if h3_text == "" else [h3_text]
    for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
        if tr_index == 0:
            tags.append(clean_node(wxr, None, tr_node.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for td_index, td_node in enumerate(tr_node.find_html_recursively("td")):
            td_text = clean_node(wxr, None, td_node)
            if td_index >= 2:
                if form.ipas:
                    form.ipas[0] += td_text
                else:
                    # first IPA part: a space follows it, except after "‿"
                    if not td_text.endswith("‿"):
                        td_text += " "
                    form.ipas.append(td_text)
                continue

            form.form += td_text
            # first part: a space follows it, except after an apostrophe
            if td_index == 0 and not td_text.endswith("’"):
                form.form += " "

        translate_raw_tags(form)
        entry.forms.append(form)
def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per row of a wikitext conjugation table.

    The text of the first row is used as a tag, after `h3_text` when that is
    not empty. In the other rows the first two cells make up the form and
    every later cell is an IPA, "Prononciation ?" placeholders excepted.
    Rows that give an empty form are dropped.
    """
    tags = [] if h3_text == "" else [h3_text]
    for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
        if row_index == 0:
            tags.append(clean_node(wxr, None, row.children))
            continue

        form = Form(raw_tags=tags, source=conj_page_title)
        for cell_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            cell_text = clean_node(wxr, None, cell)
            is_placeholder = cell_text.endswith("Prononciation ?")
            if cell_index >= 2:
                if not is_placeholder:
                    form.ipas.append(cell_text)
                continue
            if cell_text == "—" or is_placeholder:
                continue
            # a part starting with "-" is glued to the previous part,
            # unless that part ends with ")"
            if cell_text.startswith("-") and not form.form.strip().endswith(
                ")"
            ):
                form.form = form.form.strip()
            form.form += cell_text
            if cell_index == 0 and len(cell_text) > 0:
                form.form += " "

        if len(form.form) > 0:
            translate_raw_tags(form)
            entry.forms.append(form)
def process_ja_flx_adj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from a Japanese adjective inflection template.

    https://fr.wiktionary.org/wiki/Modèle:ja-adj
    https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な

    A header cell sets the first tag of the rows that follow it. In a row,
    the lines of the cell at position 0 are added to the tags and the lines
    of the cells at positions 1, 2 and 3 give the form, hiragana and roman
    values, line N of each cell belonging to the Nth form of the row.
    """
    # cell position in the row -> name of the `Form` field it fills
    field_of_cell = {1: "form", 2: "hiragana", 3: "roman"}

    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_root.find_child(NodeKind.TABLE):
        first_tag = ""
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            row_forms = []
            # uses the header seen in earlier rows; a header cell of this
            # row only changes the tags of the following rows
            tags = [first_tag]
            cells = row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
            for cell_index, cell in enumerate(cells):
                cell_text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    first_tag = cell_text
                    continue
                for line_index, line in enumerate(cell_text.splitlines()):
                    if cell_index == 0:
                        tags.append(line)
                        continue
                    if line_index >= len(row_forms):
                        new_form = Form(raw_tags=tags, source=conj_page_title)
                        row_forms.append(translate_raw_tags(new_form))
                    field_name = field_of_cell.get(cell_index)
                    if field_name is not None:
                        setattr(row_forms[line_index], field_name, line)

            entry.forms.extend(row_forms)
def process_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from a Japanese verb conjugation template.

    https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    Modèle:ja-在る

    A row made of a single header cell sets the first tag of the following
    rows. Other header cells are row headers, they stay active for as many
    rows as their "rowspan" attribute says. The lines of the data cells at
    positions 0, 1 and 2 give the form, hiragana and roman values.
    """
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows, this one included, it still covers
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if (
                all(
                    isinstance(c, WikiNode)
                    and c.kind == NodeKind.TABLE_HEADER_CELL
                    for c in row.children
                )
                and len(row.children) > 1
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    # guarded parsing: a malformed attribute counts as 1
                    # instead of raising ValueError
                    _, rowspan = get_cell_span(header)
                    row_headers[header_text] = rowspan

            tags = [first_tag]
            # iterate over a copy, finished headers are deleted on the way
            for tag, remaining_rows in list(row_headers.items()):
                tags.append(tag)
                if remaining_rows <= 1:
                    # "<=" so that a rowspan of 0 can't keep a header forever
                    del row_headers[tag]
                else:
                    row_headers[tag] = remaining_rows - 1

            forms = []
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                for line_index, line in enumerate(cell_text.splitlines()):
                    if cell_index == 0:
                        forms.append(
                            Form(
                                form=line.strip(),
                                raw_tags=tags,
                                source=conj_page_title,
                            )
                        )
                    elif cell_index == 1 and line_index < len(forms):
                        forms[line_index].hiragana = line.strip()
                    elif cell_index == 2 and line_index < len(forms):
                        forms[line_index].roman = line.strip()

            for form in forms:
                if len(form.form) > 0:
                    translate_raw_tags(form)
                    entry.forms.append(form)
def extract_ku_conj_trans_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a Kurdish conjugation template, extract forms from each table of
    the expansion and pass its top-level links to `clean_node` with `entry`.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table_node in expanded_root.find_child(NodeKind.TABLE):
        extract_ku_conj_trans_table_node(
            wxr, entry, table_node, conj_page_title
        )
    for link_node in expanded_root.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link_node)
def extract_ku_conj_trans_table_node(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from one table of a Kurdish conjugation page.

    Header cells become column headers, except those starting with one of
    the table or section titles listed below. The column headers are reset
    when a row with header cells follows a row without any. Each data cell
    gives one form, tagged with the headers covering its column; cells equal
    to the page title are skipped.
    """

    @dataclass
    class ColumnHeader:
        text: str
        index: int
        span: int

    section_titles = (
        "Conjugaison du verbe",
        "TEMPS DU PRÉSENT ET DU FUTUR",
        "TEMPS DU PRESENT ET DU FUTUR",
        "TEMPS DU PASSÉ",
        "TEMPS DU PASSE",
    )
    col_headers = []
    prev_row_has_header = False
    last_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        row_has_header = row.contain_node(NodeKind.TABLE_HEADER_CELL)
        if row_has_header and not prev_row_has_header:
            # a new group of header rows starts here
            col_headers.clear()

        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_str = clean_node(wxr, None, cell)
            if cell_str == "":
                col_index += 1
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                last_header = cell_str
                if cell_str.startswith(section_titles):
                    continue
                colspan_str = cell.attrs.get("colspan", "1")
                if re.fullmatch(r"\d+", colspan_str) is None:
                    colspan = 1
                else:
                    colspan = int(colspan_str)
                col_headers.append(
                    ColumnHeader(text=cell_str, index=col_index, span=colspan)
                )
                col_index += colspan
                continue

            if last_header == "TEMPS DU PASSÉ":
                # data cell directly under the section title
                continue
            if cell_str == "(inusité)":
                col_index += 1
            elif cell_str != wxr.wtp.title:
                form = Form(form=cell_str, source=conj_page_title)
                for header in col_headers:
                    if header.index <= col_index < header.index + header.span:
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                entry.forms.append(form)
                col_index += 1

        prev_row_has_header = row_has_header
def extract_ko_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the Korean "ko-conj" template.

    The template is expanded while `wxr.wtp.title` is switched to the
    conjugation page title; the title of the word page is put back when the
    function ends, even if an exception is raised.

    The first table of the expansion is skipped. For the other tables the
    caption, the column headers (header cells of rows without data cells)
    and the row headers (header cells of rows with data cells) are used as
    tags. The lines of a data cell are, in order, the form, its
    romanization and its IPA. Forms that are empty or equal to the word page
    title are skipped.
    """
    word_page_title = wxr.wtp.title
    wxr.wtp.title = conj_page_title
    try:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        for h3 in expanded_node.find_html("h3"):
            clean_node(wxr, entry, h3)

        for table_index, table in enumerate(
            expanded_node.find_child(NodeKind.TABLE)
        ):
            if table_index == 0:
                continue
            shared_raw_tags = []
            for caption_node in table.find_child(NodeKind.TABLE_CAPTION):
                caption = clean_node(wxr, None, caption_node.children)
                if caption != "":
                    shared_raw_tags.append(caption)
            col_headers, row_headers = _collect_ko_table_headers(wxr, table)

            # rows without data cells are not counted
            row_index = 0
            for row in table.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                for cell in row.find_child(NodeKind.TABLE_CELL):
                    cell_str = clean_node(wxr, None, cell)
                    _, rowspan = get_cell_span(cell)
                    if cell_str != "—":
                        form = Form(
                            source=conj_page_title, raw_tags=shared_raw_tags
                        )
                        for line_index, line in enumerate(
                            cell_str.splitlines()
                        ):
                            if line_index == 0:
                                form.form = line
                            elif line_index == 1:
                                form.roman = line
                            elif line_index == 2:
                                form.ipas.append(line)
                        for header in col_headers:
                            if (
                                col_index >= header.col_index
                                and col_index
                                < header.col_index + header.colspan
                            ):
                                form.raw_tags.append(header.text)
                        for header in row_headers:
                            if (
                                row_index < header.row_index + header.rowspan
                                and row_index + rowspan > header.row_index
                            ):
                                form.raw_tags.append(header.text)
                        # `wxr.wtp.title` is the conjugation page title at
                        # this point, compare with the saved word title
                        if form.form not in ["", word_page_title]:
                            translate_raw_tags(form)
                            entry.forms.append(form)
                    col_index += 1
                if row.contain_node(NodeKind.TABLE_CELL):
                    row_index += 1

        for link in expanded_node.find_child(NodeKind.LINK):
            clean_node(wxr, entry, link)
    finally:
        wxr.wtp.title = word_page_title


def _collect_ko_table_headers(
    wxr: WiktextractContext, table: WikiNode
) -> tuple[list[TableHeader], list[TableHeader]]:
    """Return the column headers and the row headers of a "ko-conj" table."""
    col_headers = []
    row_headers = []
    # rows without data cells are not counted
    row_index = 0
    # for each stack of row headers, the first row not covered yet
    row_header_indexes = [0]
    for row in table.find_child(NodeKind.TABLE_ROW):
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        col_index = 0
        for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
            cell_str = clean_node(wxr, None, header_cell)
            if cell_str == "":
                continue
            colspan, rowspan = get_cell_span(header_cell)
            if row_has_data:
                header_added = False
                current_row_index = row_index
                for index, row_header_index in enumerate(row_header_indexes):
                    if row_index >= row_header_index:
                        current_row_index = row_header_index
                        row_header_indexes[index] += rowspan
                        header_added = True
                        break
                if not header_added:
                    row_header_indexes.append(rowspan)
                row_headers.append(
                    TableHeader(
                        text=cell_str,
                        row_index=current_row_index,
                        rowspan=rowspan,
                    )
                )
            else:
                col_headers.append(
                    TableHeader(
                        text=cell_str, col_index=col_index, colspan=colspan
                    )
                )
            col_index += colspan
        if row_has_data:
            row_index += 1
    return col_headers, row_headers
def get_cell_span(cell: "WikiNode") -> tuple[int, int]:
    """
    Return the `(colspan, rowspan)` of a table cell.

    Both values are read from `cell.attrs`. A missing attribute, or a value
    that is not a plain sequence of digits once surrounding whitespace is
    removed, counts as 1. A value of 0 also counts as 1, so that a cell
    always covers at least its own column and row.
    """

    def parse_span(attr_name: str) -> int:
        span_str = cell.attrs.get(attr_name, "1").strip()
        if re.fullmatch(r"\d+", span_str) is None:
            return 1
        return max(int(span_str), 1)

    return parse_span("colspan"), parse_span("rowspan")
def extract_de_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the German "de-conj" template.

    The template is expanded while `wxr.wtp.title` is switched to the
    conjugation page title; the title of the word page is put back right
    after the expansion, even if it fails.

    In the first two tables each data cell is one form and the text before
    a colon, if any, is a tag. In the later tables the non-empty cells of a
    row are read in pairs and the two cells of a pair are joined with a
    space. The header of a single-cell row is used as the table tag, other
    header cells as column tags. Forms that are empty or equal to the page
    title are skipped.
    """
    word_page_title = wxr.wtp.title
    wxr.wtp.title = conj_page_title
    try:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
    finally:
        # never leave the context on the conjugation page title
        wxr.wtp.title = word_page_title

    for table_index, table in enumerate(
        expanded_node.find_child(NodeKind.TABLE)
    ):
        table_header = ""
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            word_part = ""
            # counts the non-empty cells of the row, header cells included
            col_index = 0
            if table_index >= 2 and row.contain_node(
                NodeKind.TABLE_HEADER_CELL
            ):
                col_headers.clear()
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    continue
                elif cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if len(row.children) == 1:
                        table_header = clean_node(wxr, None, cell)
                    else:
                        col_headers.append(clean_node(wxr, None, cell))
                elif table_index < 2:
                    form = Form(form=cell_text, source=conj_page_title)
                    if ":" in cell_text:
                        colon_index = cell_text.index(":")
                        raw_tag = cell_text[:colon_index].strip()
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        form.form = cell_text[colon_index + 1 :].strip()
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if col_index < len(col_headers):
                        form.raw_tags.append(col_headers[col_index])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                elif col_index % 2 == 0:
                    # first cell of a pair, kept for the next cell
                    word_part = cell_text
                else:
                    form = Form(
                        form=f"{word_part} {cell_text}", source=conj_page_title
                    )
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    # one column header per pair of cells
                    if col_index // 2 < len(col_headers):
                        form.raw_tags.append(col_headers[col_index // 2])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += 1

    for cat_link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, cat_link)
def extract_declension_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
):
    """
    Read a declension page of the "Appendix" namespace and extract forms
    from each of its top-level templates. Does nothing if the page has no
    body.
    """
    appendix_ns_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
    page_text = wxr.wtp.get_page_body(page_title, appendix_ns_id)
    if page_text is None:
        return
    for template_node in wxr.wtp.parse(page_text).find_child(
        NodeKind.TEMPLATE
    ):
        extract_declension_template(
            wxr, word_entry, page_title, template_node, ""
        )
def extract_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
) -> None:
    """
    Extract forms from one template of a declension page.

    German adjective declension templates are passed to
    `extract_de_adj_declension_template` with `tab_name`. For the
    "Onglets conjugaison" template, the templates found in each "contenuN"
    argument are processed recursively, with the text of the "ongletN"
    argument as their tab name. Reading stops at the first missing
    "ongletN"; a tab without "contenuN" is skipped.
    """
    if t_node.template_name in [
        "de-adjectif-déclinaisons",
        "de-adj-déclinaisons",
    ]:
        extract_de_adj_declension_template(
            wxr, word_entry, page_title, t_node, tab_name
        )
    elif t_node.template_name == "Onglets conjugaison":
        params = t_node.template_parameters
        for index in range(1, 7):
            tab_name_arg = f"onglet{index}"
            if tab_name_arg not in params:
                break
            tab_content_arg = f"contenu{index}"
            if tab_content_arg not in params:
                # tab title without content, nothing to extract
                continue
            tab_name = clean_node(wxr, None, params[tab_name_arg])
            tab_content = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(params[tab_content_arg])
            )
            for node in tab_content.find_child(NodeKind.TEMPLATE):
                extract_declension_template(
                    wxr, word_entry, page_title, node, tab_name
                )
def extract_de_adj_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
) -> None:
    """
    Extract forms from a German adjective declension template.

    https://fr.wiktionary.org/wiki/Modèle:de-adjectif-déclinaisons

    Each table below a section title is read row by row. Header cells of
    rows without data cells are column headers ("Forme" excepted), header
    cells of rows with data cells are row headers. A data cell below the
    "Article" column header only sets the article of the forms that follow
    it in the row; every other data cell is a form, tagged with its column
    headers, `tab_name`, the section title, the table caption and the row
    header. Forms that are empty or equal to the page title are skipped.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for level_node in expanded_node.find_child(LEVEL_KIND_FLAGS):
        section_title = clean_node(wxr, None, level_node.largs)
        for table in level_node.find_child(NodeKind.TABLE):
            table_caption = ""
            for cap_node in table.find_child(NodeKind.TABLE_CAPTION):
                table_caption = clean_node(wxr, None, cap_node.children)
            col_headers = []
            for row in table.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                row_has_data = row.contain_node(NodeKind.TABLE_CELL)
                article = ""
                for cell in row.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    # guarded parsing: a malformed attribute counts as 1
                    # instead of raising ValueError
                    colspan, _ = get_cell_span(cell)
                    cell_text = clean_node(wxr, None, cell)
                    if cell.kind == NodeKind.TABLE_HEADER_CELL:
                        if row_has_data:
                            row_header = cell_text
                        elif cell_text != "Forme":
                            col_headers.append(
                                TableHeader(
                                    cell_text,
                                    col_index=col_index,
                                    colspan=colspan,
                                )
                            )
                    else:
                        use_col_headers = [
                            col_header.text
                            for col_header in col_headers
                            if col_header.col_index
                            <= col_index
                            < col_header.col_index + col_header.colspan
                        ]
                        if "Article" in use_col_headers:
                            if cell_text != "—":
                                article = cell_text
                        else:
                            form = Form(
                                form=cell_text,
                                article=article,
                                raw_tags=use_col_headers,
                                source=page_title,
                            )
                            for raw_tag in [
                                tab_name,
                                section_title,
                                table_caption,
                                row_header,
                            ]:
                                if raw_tag != "":
                                    form.raw_tags.append(raw_tag)
                            if form.form not in ["", wxr.wtp.title]:
                                translate_raw_tags(form)
                                word_entry.forms.append(form)
                    col_index += colspan

        for link in level_node.find_child(NodeKind.LINK):
            clean_node(wxr, word_entry, link)
def extract_pt_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """
    Extract forms from a Portuguese "pt-conj/..." template.

    Header cells of rows without data cells are column headers, header
    cells of rows with data cells are row headers; both are stored with
    their position and span. The "Formas pessoais" header starts a new group
    and drops the headers collected before it. Each line of a data cell is
    one form; the text before a colon, if any, is an extra tag. Cells
    containing a list are skipped, and so are forms that are empty, "-" or
    equal to the page title.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        row_headers = []
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # shift the first column past the headers of earlier rows that
            # still cover this row through their rowspan
            for header in chain(col_headers, row_headers):
                if (
                    header.row_index < row_index
                    and row_index < header.row_index + header.rowspan
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan

            for cell_node in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell_node)
                # guarded parsing: a malformed attribute counts as 1
                # instead of raising ValueError
                colspan, rowspan = get_cell_span(cell_node)
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    new_header = TableHeader(
                        cell_text, col_index, colspan, row_index, rowspan
                    )
                    if row_has_data:
                        row_headers.append(new_header)
                    else:
                        if (
                            cell_text
                            == "Formas pessoais\n(formes personnelles)"
                        ):
                            col_headers.clear()
                            row_headers.clear()
                        col_headers.append(new_header)
                elif cell_node.contain_node(NodeKind.LIST):
                    continue  # skip end notes
                else:
                    for line in cell_text.splitlines():
                        form_str = line.strip("/ \n")
                        raw_tag = ""
                        if ":" in form_str:
                            colon_index = form_str.index(":")
                            raw_tag = form_str[:colon_index].strip()
                            form_str = form_str[colon_index + 1 :].strip()
                        if form_str in ["", "-", wxr.wtp.title]:
                            continue
                        form = Form(form=form_str, source=page_title)
                        for col_header in col_headers:
                            columns_overlap = (
                                col_header.col_index < col_index + colspan
                                and col_index
                                < col_header.col_index + col_header.colspan
                            )
                            # "Modo Subjuntivo" header
                            first_col_rows_overlap = (
                                col_header.col_index == 0
                                and col_header.row_index < row_index + rowspan
                                and col_header.row_index + col_header.rowspan
                                > row_index
                            )
                            if (
                                (columns_overlap or first_col_rows_overlap)
                                and col_header.text != ""
                                and col_header.text not in form.raw_tags
                            ):
                                form.raw_tags.append(col_header.text)
                        for row_header in row_headers:
                            if (
                                row_header.row_index < row_index + rowspan
                                and row_index
                                < row_header.row_index + row_header.rowspan
                                and row_header.text != ""
                                and row_header.text not in form.raw_tags
                            ):
                                form.raw_tags.append(row_header.text)
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan
def extract_cs_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """
    Extract forms from a Czech "cs-conj-..." template.

    Header cells with `scope="col"` are column headers, header cells with
    `scope="row"` give the row header. A data cell may hold several forms
    separated by `<br>`; `<span>` tags of class "ligne-de-forme" or
    "registre" are tags of the form they are written next to, all other
    nodes are part of the form text. Forms that are empty, "—" or equal to
    the page title are skipped.
    """

    def add_form(form_nodes, col_headers, col_index, row_header, raw_tags):
        # Emit the form collected so far, then empty both buffers.
        form_str = clean_node(wxr, None, form_nodes)
        if form_str not in ["", "—", wxr.wtp.title]:
            form = Form(form=form_str, source=page_title)
            if col_index < len(col_headers):
                form.raw_tags.append(col_headers[col_index])
            if row_header != "":
                form.raw_tags.append(row_header)
            form.raw_tags.extend(raw_tags)
            translate_raw_tags(form)
            word_entry.forms.append(form)
        # Cleared even when the form is skipped, otherwise its nodes and
        # tags would be merged into the next form of the same cell.
        form_nodes.clear()
        raw_tags.clear()

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            for col_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    cell_scope = cell.attrs.get("scope", "")
                    if cell_scope == "col":
                        col_headers.append(clean_node(wxr, None, cell))
                    elif cell_scope == "row":
                        row_header = clean_node(wxr, None, cell)
                    continue

                raw_tags = []
                form_nodes = []
                for node in cell.children:
                    if isinstance(node, HTMLNode) and node.tag == "span":
                        span_class = node.attrs.get("class", "").split()
                        if (
                            "ligne-de-forme" in span_class
                            or "registre" in span_class
                        ):
                            raw_tag = clean_node(wxr, None, node).strip("() ")
                            if raw_tag != "":
                                raw_tags.append(raw_tag)
                        else:
                            form_nodes.append(node)
                    elif isinstance(node, HTMLNode) and node.tag == "br":
                        add_form(
                            form_nodes,
                            col_headers,
                            col_index,
                            row_header,
                            raw_tags,
                        )
                    else:
                        form_nodes.append(node)
                # last form of the cell, not followed by <br>
                add_form(
                    form_nodes, col_headers, col_index, row_header, raw_tags
                )