Coverage for src / wiktextract / extractor / fr / conjugation.py: 91%
556 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
2from dataclasses import dataclass
3from itertools import chain
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 HTMLNode,
8 LevelNode,
9 NodeKind,
10 TemplateNode,
11 WikiNode,
12)
14from ...page import clean_node
15from ...wxr_context import WiktextractContext
16from .models import Form, WordEntry
17from .tags import translate_raw_tags
def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """
    Load a page of the "Conjugaison" namespace and extract its forms.

    Templates at the root of the page, and templates that are direct
    children of root-level section nodes, are passed to
    `extract_conj_templates` together with `select_tab`. For pages whose
    title starts with "Conjugaison:kurde/", root-level tables are also passed
    to `extract_ku_conj_trans_table_node`. Nothing happens when the page body
    is not available.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    conj_page = wxr.wtp.get_page_body(conj_page_title, namespace_id)
    if conj_page is None:
        return

    conj_root = wxr.wtp.parse(conj_page)
    for root_child in conj_root.children:
        if isinstance(root_child, TemplateNode):
            templates = [root_child]
        elif isinstance(root_child, LevelNode):
            templates = root_child.find_child(NodeKind.TEMPLATE)
        else:
            continue
        for template in templates:
            extract_conj_templates(
                wxr, entry, conj_page_title, template, select_tab
            )

    if conj_page_title.startswith("Conjugaison:kurde/"):
        for table in conj_root.find_child(NodeKind.TABLE):
            extract_ku_conj_trans_table_node(wxr, entry, table, conj_page_title)

    # NOTE(review): no entry is passed here and the returned text is
    # discarded, so this call does not appear to record anything on `entry`
    # — confirm whether `entry` was meant to be passed.
    for link_node in conj_root.find_child(NodeKind.LINK):
        clean_node(wxr, None, link_node)
def extract_conj_templates(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    conj_template: TemplateNode,
    select_tab: str = "1",
) -> None:
    """
    Route one template of a conjugation page to its extractor.

    The choice is made on the template name only. The order of the checks
    matters: the language-specific names are tested before the generic
    "-conj" / "it-" rule, and "ja-flx-adj" before the generic "ja-" prefix.
    Templates whose name ends with "-intro", or that match no rule, are
    ignored.
    """
    name = conj_template.template_name
    if name.endswith("-intro"):
        return

    if name in ("ku-conj-trans", "ku-conj"):
        extract_ku_conj_trans_template(
            wxr, entry, conj_template, conj_page_title
        )
    elif name == "ko-conj":
        extract_ko_conj_template(wxr, entry, conj_template, conj_page_title)
    elif name == "de-conj":
        extract_de_conj_template(wxr, entry, conj_template, conj_page_title)
    elif name.startswith("pt-conj/"):
        extract_pt_conj_template(wxr, entry, conj_template, conj_page_title)
    elif name.startswith("cs-conj-"):
        extract_cs_conj_template(wxr, entry, conj_template, conj_page_title)
    elif name.startswith(("ro-verb-", "se-conj-")):
        # imported here rather than at module level, as in the original code
        from .inflection import extract_inf_table_template

        extract_inf_table_template(wxr, entry, conj_template, conj_page_title)
    elif "-conj" in name or name.startswith("it-"):
        # the "it-" prefix covers the Italian table templates:
        # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_italien
        process_conj_template(wxr, entry, conj_template, conj_page_title)
    elif name == "Onglets conjugaison":
        process_onglets_conjugaison_template(
            wxr, entry, conj_template, conj_page_title, select_tab
        )
    elif name.removeprefix(":").startswith("Conjugaison:"):
        # transclusion of another conjugation page; the "sél" argument
        # (default "2") is forwarded as the selected tab
        selected = clean_node(
            wxr, None, conj_template.template_parameters.get("sél", "2")
        )
        extract_conjugation(wxr, entry, name.removeprefix(":"), selected)
    elif name.startswith("ja-flx-adj"):
        process_ja_flx_adj_template(wxr, entry, conj_template, conj_page_title)
    elif name.startswith("ja-"):
        process_ja_conj_template(wxr, entry, conj_template, conj_page_title)
def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """
    Extract the conjugation tables held in an "Onglets conjugaison" template.

    https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison

    The tab contents are read from the `contenu1`..`contenu6` arguments.
    Only the tab named by `select_tab` is read when `select_tab` is not "1",
    or when the first tab is titled "Conjugaison active"; otherwise every
    tab is read until the first missing `contenuN` argument.

    A tab value may be a single template node or a list of nodes; in both
    cases a template is processed when its name contains "-conj" or starts
    with "it-".
    """

    def is_conj_table_template(candidate) -> bool:
        # same rule for a lone template value and for list items
        return isinstance(candidate, TemplateNode) and (
            "-conj" in candidate.template_name
            or candidate.template_name.startswith("it-")
        )

    # The title of the first tab is only looked at when `select_tab` is "1".
    if select_tab != "1" or (
        clean_node(wxr, None, node.template_parameters.get("onglet1", ""))
        == "Conjugaison active"
    ):
        # don't extract or only extract "Conjugaison pronominale" tab
        selected_tabs = [select_tab]
    else:
        selected_tabs = [str(i) for i in range(1, 7)]

    for tab_index in selected_tabs:
        arg_name = f"contenu{tab_index}"
        if arg_name not in node.template_parameters:
            break
        arg_value = node.template_parameters[arg_name]
        arg_nodes = arg_value if isinstance(arg_value, list) else [arg_value]
        for arg_node in arg_nodes:
            if is_conj_table_template(arg_node):
                process_conj_template(wxr, entry, arg_node, conj_page_title)
def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a conjugation table template and extract forms from the result.

    https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)
def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Walk an expanded conjugation template and dispatch its `<div>` blocks.

    The current heading text starts as the title of `node` when it is a
    level-3 section (empty otherwise) and is replaced by each `<h3>` element
    met among the children. Section children are processed recursively.
    A `<div>` under the heading "Modes impersonnels" goes to
    `process_fr_conj_modes_table`, any other `<div>` to
    `process_fr_conj_table` along with the current heading text.
    """
    h3_text = ""
    if node.kind == NodeKind.LEVEL3:
        h3_text = clean_node(wxr, None, node.largs)

    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue
        if child.tag == "h3":
            h3_text = clean_node(wxr, None, child)
        elif child.tag == "div":
            if h3_text == "Modes impersonnels":
                process_fr_conj_modes_table(wxr, entry, child, conj_page_title)
            else:
                process_fr_conj_table(
                    wxr, entry, child, h3_text, conj_page_title
                )
@dataclass
class TableHeader:
    """Text of a table header cell with its position and extent."""

    # cleaned text of the header cell
    text: str
    # column position assigned by the extractor that created the header
    col_index: int = 0
    # number of columns covered; left at 0 by callers that only track rows
    colspan: int = 0
    # row position assigned by the extractor that created the header
    row_index: int = 0
    # number of rows covered; left at 0 by callers that only track columns
    rowspan: int = 0
def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the first table, titled "Modes impersonnels".

    A row without data cells is a header row: each of its header cells,
    except the one whose text is "Mode", becomes a column header covering
    `colspan` columns. In other rows the header cell (or a bold cell met
    while the column counter is still 0) gives the row header; the texts of
    the following cells are joined into one form, which is completed and
    stored when a cell ending with "]", "\\" or "Prononciation ?" is
    reached.
    """
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in div_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            is_header_row = not row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            form_text = ""
            for node in row.find_child(cell_kinds):
                is_header_cell = node.kind == NodeKind.TABLE_HEADER_CELL or (
                    node.contain_node(NodeKind.BOLD) and col_index == 0
                )
                if is_header_cell and not is_header_row:
                    row_header = clean_node(wxr, None, node)
                    continue
                if is_header_cell:
                    header_text = clean_node(wxr, None, node)
                    if header_text != "Mode":
                        colspan_str = node.attrs.get("colspan", "1")
                        # anything that is not a plain number counts as 1
                        if re.fullmatch(r"\d+", colspan_str) is not None:
                            colspan = int(colspan_str)
                        else:
                            colspan = 1
                        col_headers.append(
                            TableHeader(header_text, col_index, colspan)
                        )
                        col_index += colspan
                    continue

                node_text = clean_node(wxr, None, node)
                closes_form = node_text.endswith(
                    ("]", "\\", "Prononciation ?")
                )
                if closes_form and form_text != "":
                    # the placeholder "Prononciation ?" is not kept as IPA
                    if node_text.endswith(("]", "\\")):
                        ipas = [node_text]
                    else:
                        ipas = []
                    form = Form(
                        form=form_text, ipas=ipas, source=conj_page_title
                    )
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for col_header in col_headers:
                        first_col = col_header.col_index
                        if first_col <= col_index < first_col + col_header.colspan:
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    entry.forms.append(form)
                    form_text = ""
                elif node_text != "":
                    # no space is inserted after an apostrophe
                    if form_text != "" and not form_text.endswith("’"):
                        form_text += " "
                    form_text += node_text
                col_index += 1
def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the tables nested in the cells of a layout table.

    Every data cell of every table under `div_node` is scanned: an HTML
    `<table>` child goes to `process_fr_conj_html_table`, a wikitext table
    child to `process_fr_conj_wiki_table`. `h3_text` is passed through
    unchanged.
    """
    for table_node in div_node.find_child(NodeKind.TABLE):
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(NodeKind.TABLE_CELL):
                for cell_child in cell.children:
                    if not isinstance(cell_child, WikiNode):
                        continue
                    if (
                        cell_child.kind == NodeKind.HTML
                        and cell_child.tag == "table"
                    ):
                        process_fr_conj_html_table(
                            wxr, entry, cell_child, h3_text, conj_page_title
                        )
                    elif cell_child.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, cell_child, h3_text, conj_page_title
                        )
def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per `<tr>` of an HTML conjugation table.

    The text of the first `<tr>` is used as a raw tag, after `h3_text` when
    that is not empty. In every other row the first two `<td>` texts make up
    the form (a space follows the first one unless it ends with an
    apostrophe) and the remaining `<td>` texts are concatenated into a
    single IPA string.
    """
    raw_tags = [] if h3_text == "" else [h3_text]
    for row_number, tr_node in enumerate(
        table_node.find_html_recursively("tr")
    ):
        if row_number == 0:
            raw_tags.append(clean_node(wxr, None, tr_node.children))
            continue

        form = Form(raw_tags=raw_tags, source=conj_page_title)
        for column, td_node in enumerate(tr_node.find_html_recursively("td")):
            td_text = clean_node(wxr, None, td_node)
            if column >= 2:
                if len(form.ipas) > 0:
                    form.ipas[0] += td_text
                else:
                    # first IPA piece: followed by a space unless it ends
                    # with the liaison mark
                    if not td_text.endswith("‿"):
                        td_text += " "
                    form.ipas.append(td_text)
                continue
            form.form += td_text
            if column == 0 and not td_text.endswith("’"):
                form.form += " "

        translate_raw_tags(form)
        entry.forms.append(form)
def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per row of a wikitext conjugation table.

    The text of the first row is used as a raw tag, after `h3_text` when
    that is not empty. In every other row the first two data cells make up
    the form and the later ones are IPA; cells ending with
    "Prononciation ?" are ignored, as is "—" among the first two cells.
    Rows that leave the form empty are dropped.
    """
    raw_tags = [] if h3_text == "" else [h3_text]
    for row_number, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
        if row_number == 0:
            raw_tags.append(clean_node(wxr, None, row.children))
            continue

        form = Form(raw_tags=raw_tags, source=conj_page_title)
        for column, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            cell_text = clean_node(wxr, None, cell)
            lacks_pronunciation = cell_text.endswith("Prononciation ?")
            if column >= 2:
                if not lacks_pronunciation:
                    form.ipas.append(cell_text)
                continue
            if cell_text == "—" or lacks_pronunciation:
                continue
            # a piece starting with "-" is glued to the previous text,
            # unless that text ends with ")"
            if cell_text.startswith("-") and not form.form.strip().endswith(
                ")"
            ):
                form.form = form.form.strip()
            form.form += cell_text
            if column == 0 and cell_text != "":
                form.form += " "

        if form.form != "":
            translate_raw_tags(form)
            entry.forms.append(form)
def process_ja_flx_adj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from an expanded Japanese adjective inflection template.

    https://fr.wiktionary.org/wiki/Modèle:ja-adj
    https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な

    The text of a header cell becomes the leading raw tag of the rows that
    come after it. In a row, a data cell at position 0 adds its lines to the
    tags; the cells at positions 1, 2 and 3 give, line by line, the `form`,
    `hiragana` and `roman` values of the row's forms.
    """
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            forms = []
            # built before the cells are read, so a header cell of this row
            # only affects the rows below
            tags = [first_tag]
            for cell_index, row_child in enumerate(row.find_child(cell_kinds)):
                row_child_text = clean_node(wxr, None, row_child)
                if row_child.kind == NodeKind.TABLE_HEADER_CELL:
                    first_tag = row_child_text
                    continue
                cell_lines = row_child_text.splitlines()
                if cell_index == 0:
                    tags.extend(cell_lines)
                    continue
                for line_index, line in enumerate(cell_lines):
                    if line_index >= len(forms):
                        # tags are translated when the form is created
                        forms.append(
                            translate_raw_tags(
                                Form(raw_tags=tags, source=conj_page_title)
                            )
                        )
                    target = forms[line_index]
                    if cell_index == 1:
                        target.form = line
                    elif cell_index == 2:
                        target.hiragana = line
                    elif cell_index == 3:
                        target.roman = line

            entry.forms.extend(forms)
def process_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from an expanded Japanese verb conjugation template.

    https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    Modèle:ja-在る

    A row holding a single header cell sets the leading raw tag of the rows
    that follow. Other header cells are row headers that stay active for
    `rowspan` rows. The data cells at positions 0, 1 and 2 give, line by
    line, the `form`, `hiragana` and `roman` values; forms with an empty
    `form` are dropped.
    """
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows it still applies to
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if (
                all(
                    isinstance(c, WikiNode)
                    and c.kind == NodeKind.TABLE_HEADER_CELL
                    for c in row.children
                )
                and len(row.children) > 1
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    # get_cell_span() falls back to 1 for a non-numeric
                    # attribute instead of raising ValueError
                    _, rowspan = get_cell_span(header)
                    row_headers[header_text] = rowspan

            tags = [first_tag]
            # iterate over a copy: the dict is updated inside the loop
            for tag, rowspan in row_headers.copy().items():
                tags.append(tag)
                if rowspan == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = rowspan - 1

            forms = []
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                for line_index, line in enumerate(cell_text.splitlines()):
                    if cell_index == 0:
                        forms.append(
                            Form(
                                form=line.strip(),
                                raw_tags=tags,
                                source=conj_page_title,
                            )
                        )
                    elif cell_index == 1 and line_index < len(forms):
                        forms[line_index].hiragana = line.strip()
                    elif cell_index == 2 and line_index < len(forms):
                        forms[line_index].roman = line.strip()

            for form in forms:
                if len(form.form) > 0:
                    translate_raw_tags(form)
                    entry.forms.append(form)
def extract_ku_conj_trans_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a Kurdish conjugation template and extract its tables.

    Each top-level table of the expansion is passed to
    `extract_ku_conj_trans_table_node`; top-level links are then passed to
    `clean_node` together with `entry`.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        extract_ku_conj_trans_table_node(
            wxr, entry, table_node, conj_page_title
        )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link_node)
def extract_ku_conj_trans_table_node(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from one table of a Kurdish conjugation page.

    Header cells become column headers covering `colspan` columns, except
    those whose text starts with one of the titles in `ignore_headers`. The
    column headers are reset when a row with header cells follows a row
    without any. Each non-empty data cell yields a form tagged with the
    column headers covering its column, except "(inusité)", the page title,
    and cells met while the last header seen is "TEMPS DU PASSÉ".
    """
    ignore_headers = (
        "Conjugaison du verbe",
        "TEMPS DU PRÉSENT ET DU FUTUR",
        "TEMPS DU PRESENT ET DU FUTUR",
        "TEMPS DU PASSÉ",
        "TEMPS DU PASSE",
    )
    col_headers = []
    last_row_has_header = False
    last_header = ""
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        current_row_has_header = row.contain_node(NodeKind.TABLE_HEADER_CELL)
        if not last_row_has_header and current_row_has_header:
            # a new header block starts here
            col_headers.clear()
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_str = clean_node(wxr, None, cell)
            if cell_str == "":
                col_index += 1
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_str.startswith(ignore_headers):
                    last_header = cell_str
                    continue
                colspan, _ = get_cell_span(cell)
                col_headers.append(
                    TableHeader(
                        text=cell_str, col_index=col_index, colspan=colspan
                    )
                )
                last_header = cell_str
                col_index += colspan
            elif last_header == "TEMPS DU PASSÉ":
                # NOTE(review): `ignore_headers` also lists the unaccented
                # "TEMPS DU PASSE", which this comparison does not match —
                # confirm whether both spellings should be skipped here.
                continue
            elif cell_str == "(inusité)":
                col_index += 1
            elif cell_str != wxr.wtp.title:
                form = Form(form=cell_str, source=conj_page_title)
                for header in col_headers:
                    if (
                        col_index >= header.col_index
                        and col_index < header.col_index + header.colspan
                    ):
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                entry.forms.append(form)
                col_index += 1
        last_row_has_header = current_row_has_header
561def extract_ko_conj_template(
562 wxr: WiktextractContext,
563 entry: WordEntry,
564 t_node: TemplateNode,
565 conj_page_title: str,
566) -> None:
567 word_page_title = wxr.wtp.title
568 wxr.wtp.title = conj_page_title
569 expanded_node = wxr.wtp.parse(
570 wxr.wtp.node_to_wikitext(t_node), expand_all=True
571 )
572 for h3 in expanded_node.find_html("h3"):
573 clean_node(wxr, entry, h3)
574 for table_index, table in enumerate(
575 expanded_node.find_child(NodeKind.TABLE)
576 ):
577 if table_index == 0:
578 continue
579 shared_raw_tags = []
580 for caption_node in table.find_child(NodeKind.TABLE_CAPTION):
581 caption = clean_node(wxr, None, caption_node.children)
582 if caption != "": 582 ↛ 580line 582 didn't jump to line 580 because the condition on line 582 was always true
583 shared_raw_tags.append(caption)
584 col_headers = []
585 row_headers = []
586 row_index = 0
587 row_header_indexes = [0]
588 for row in table.find_child(NodeKind.TABLE_ROW):
589 col_index = 0
590 for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
591 cell_str = clean_node(wxr, None, header_cell)
592 if cell_str == "":
593 continue
594 colspan, rowspan = get_cell_span(header_cell)
595 if row.contain_node(NodeKind.TABLE_CELL):
596 header_added = False
597 current_row_index = row_index
598 for index, row_header_index in enumerate( 598 ↛ 606line 598 didn't jump to line 606 because the loop on line 598 didn't complete
599 row_header_indexes
600 ):
601 if row_index >= row_header_index:
602 current_row_index = row_header_indexes[index]
603 row_header_indexes[index] += rowspan
604 header_added = True
605 break
606 if not header_added: 606 ↛ 607line 606 didn't jump to line 607 because the condition on line 606 was never true
607 row_header_indexes.append(rowspan)
608 row_headers.append(
609 TableHeader(
610 text=cell_str,
611 row_index=current_row_index,
612 rowspan=rowspan,
613 )
614 )
615 else:
616 col_headers.append(
617 TableHeader(
618 text=cell_str,
619 col_index=col_index,
620 colspan=colspan,
621 )
622 )
623 col_index += colspan
624 if row.contain_node(NodeKind.TABLE_CELL):
625 row_index += 1
627 row_index = 0
628 for row in table.find_child(NodeKind.TABLE_ROW):
629 col_index = 0
630 for cell in row.find_child(NodeKind.TABLE_CELL):
631 cell_str = clean_node(wxr, None, cell)
632 colspan, rowspan = get_cell_span(cell)
633 if cell_str == "—": 633 ↛ 634line 633 didn't jump to line 634 because the condition on line 633 was never true
634 col_index += 1
635 else:
636 form = Form(
637 source=conj_page_title, raw_tags=shared_raw_tags
638 )
639 for line_index, line in enumerate(cell_str.splitlines()):
640 match line_index:
641 case 0:
642 form.form = line
643 case 1:
644 form.roman = line
645 case 2: 645 ↛ 639line 645 didn't jump to line 639 because the pattern on line 645 always matched
646 form.ipas.append(line)
647 for header in col_headers:
648 if (
649 col_index >= header.col_index
650 and col_index < header.col_index + header.colspan
651 ):
652 form.raw_tags.append(header.text)
653 for header in row_headers:
654 if (
655 row_index < header.row_index + header.rowspan
656 and row_index + rowspan > header.row_index
657 ):
658 form.raw_tags.append(header.text)
659 if form.form not in ["", wxr.wtp.title]: 659 ↛ 662line 659 didn't jump to line 662 because the condition on line 659 was always true
660 translate_raw_tags(form)
661 entry.forms.append(form)
662 col_index += 1
663 if row.contain_node(NodeKind.TABLE_CELL):
664 row_index += 1
666 for link in expanded_node.find_child(NodeKind.LINK):
667 clean_node(wxr, entry, link)
668 wxr.wtp.title = word_page_title
def get_cell_span(cell: WikiNode) -> tuple[int, int]:
    """
    Return the `(colspan, rowspan)` pair of a table cell.

    Each value is read from the attribute of the same name in `cell.attrs`.
    A missing attribute, or a value that is not made up of digits only,
    counts as 1.
    """
    spans = []
    for attr_name in ("colspan", "rowspan"):
        attr_value = cell.attrs.get(attr_name, "1")
        is_number = re.fullmatch(r"\d+", attr_value) is not None
        spans.append(int(attr_value) if is_number else 1)
    return spans[0], spans[1]
def extract_de_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    conj_page_title: str,
):
    """
    Expand a "de-conj" template and extract the forms of its tables.

    `wxr.wtp.title` is set to `conj_page_title` only while the template is
    expanded, and is restored even if the expansion raises.

    In the first two tables every non-empty data cell is a form; text before
    a ":" in the cell is used as a raw tag. In the later tables the cells
    come in pairs: an even column holds the first part and the following odd
    column the second part of one form. The table header and the matching
    column header are added as raw tags. Top-level links of the expansion
    are passed to `clean_node` together with `word_entry`.
    """
    word_page_title = wxr.wtp.title
    wxr.wtp.title = conj_page_title
    try:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
    finally:
        wxr.wtp.title = word_page_title

    for table_index, table in enumerate(
        expanded_node.find_child(NodeKind.TABLE)
    ):
        table_header = ""
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            word_part = ""
            col_index = 0
            if table_index >= 2 and row.contain_node(
                NodeKind.TABLE_HEADER_CELL
            ):
                # in the later tables each header row starts a new block
                col_headers.clear()
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    # empty cells do not advance the column counter
                    continue
                elif cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if len(row.children) == 1:
                        table_header = cell_text
                    else:
                        col_headers.append(cell_text)
                elif table_index < 2:
                    form = Form(form=cell_text, source=conj_page_title)
                    if ":" in cell_text:
                        colon_index = cell_text.index(":")
                        raw_tag = cell_text[:colon_index].strip()
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        form.form = cell_text[colon_index + 1 :].strip()
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if col_index < len(col_headers):
                        form.raw_tags.append(col_headers[col_index])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                elif col_index % 2 == 0:
                    word_part = cell_text
                else:
                    form = Form(
                        form=f"{word_part} {cell_text}", source=conj_page_title
                    )
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    # two cells per form, hence one header per cell pair
                    if col_index // 2 < len(col_headers):
                        form.raw_tags.append(col_headers[col_index // 2])
                    if form.form not in ["", wxr.wtp.title]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += 1

    for cat_link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, cat_link)
def extract_declension_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
):
    """
    Load a declension page of the "Appendix" namespace and extract its forms.

    Each root-level template of the page is passed to
    `extract_declension_template` with an empty tab name. Nothing happens
    when the page body is not available.
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
    page_body = wxr.wtp.get_page_body(page_title, namespace_id)
    if page_body is None:
        return
    page_root = wxr.wtp.parse(page_body)
    for template in page_root.find_child(NodeKind.TEMPLATE):
        extract_declension_template(wxr, word_entry, page_title, template, "")
def extract_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
):
    """
    Route one template of a declension page to its extractor.

    German adjective declension templates go to
    `extract_de_adj_declension_template`. For an "Onglets conjugaison"
    template, the tabs `onglet1`..`onglet6` are read until the first missing
    one; the templates found in the matching `contenuN` argument are
    processed recursively with the tab title as `tab_name`. A tab that has a
    title but no `contenuN` argument is skipped.
    """
    if t_node.template_name in (
        "de-adjectif-déclinaisons",
        "de-adj-déclinaisons",
    ):
        extract_de_adj_declension_template(
            wxr, word_entry, page_title, t_node, tab_name
        )
    elif t_node.template_name == "Onglets conjugaison":
        for index in range(1, 7):
            tab_name_arg = f"onglet{index}"
            if tab_name_arg not in t_node.template_parameters:
                break
            current_tab_name = clean_node(
                wxr, None, t_node.template_parameters[tab_name_arg]
            )
            tab_value = t_node.template_parameters.get(f"contenu{index}")
            if tab_value is None:
                # tab title without content: nothing to extract
                continue
            tab_content = wxr.wtp.parse(wxr.wtp.node_to_wikitext(tab_value))
            for node in tab_content.find_child(NodeKind.TEMPLATE):
                extract_declension_template(
                    wxr, word_entry, page_title, node, current_tab_name
                )
def extract_de_adj_declension_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    t_node: TemplateNode,
    tab_name: str,
):
    """
    Expand a German adjective declension template and extract its forms.

    https://fr.wiktionary.org/wiki/Modèle:de-adjectif-déclinaisons

    For each section of the expansion, every table is read row by row.
    Header cells of rows without data cells are column headers (the one
    whose text is "Forme" is not kept); the header cell of a data row is the
    row header. A data cell under the "Article" column header sets the
    article used by the following forms of the row; any other data cell is a
    form tagged with its column headers, `tab_name`, the section title, the
    table caption and the row header. Links that are direct children of the
    section are passed to `clean_node` together with `word_entry`.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for level_node in expanded_node.find_child(LEVEL_KIND_FLAGS):
        section_title = clean_node(wxr, None, level_node.largs)
        for table in level_node.find_child(NodeKind.TABLE):
            table_caption = ""
            for cap_node in table.find_child(NodeKind.TABLE_CAPTION):
                table_caption = clean_node(wxr, None, cap_node.children)
            col_headers = []
            for row in table.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                row_has_data = row.contain_node(NodeKind.TABLE_CELL)
                article = ""
                for cell in row.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    # get_cell_span() falls back to 1 for a non-numeric
                    # attribute instead of raising ValueError
                    colspan, _ = get_cell_span(cell)
                    cell_text = clean_node(wxr, None, cell)
                    if cell.kind == NodeKind.TABLE_HEADER_CELL:
                        if row_has_data:
                            row_header = cell_text
                        elif cell_text != "Forme":
                            col_headers.append(
                                TableHeader(
                                    cell_text,
                                    col_index=col_index,
                                    colspan=colspan,
                                )
                            )
                    else:
                        use_col_headers = [
                            col_header.text
                            for col_header in col_headers
                            if col_header.col_index
                            <= col_index
                            < col_header.col_index + col_header.colspan
                        ]
                        if "Article" in use_col_headers:
                            if cell_text != "—":
                                article = cell_text
                        else:
                            form = Form(
                                form=cell_text,
                                article=article,
                                raw_tags=use_col_headers,
                                source=page_title,
                            )
                            for raw_tag in [
                                tab_name,
                                section_title,
                                table_caption,
                                row_header,
                            ]:
                                if raw_tag != "":
                                    form.raw_tags.append(raw_tag)
                            if form.form not in ["", wxr.wtp.title]:
                                translate_raw_tags(form)
                                word_entry.forms.append(form)
                    # every cell, header or data, advances the column
                    col_index += colspan

        for link in level_node.find_child(NodeKind.LINK):
            clean_node(wxr, word_entry, link)
def extract_pt_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
):
    """
    Expand a "pt-conj/..." template and extract the forms of its tables.

    Header cells of rows without data cells are column headers, the others
    row headers; both keep their column/row position and spans. The header
    "Formas pessoais\\n(formes personnelles)" starts a new block and drops
    the headers collected so far. Each line of a data cell is a form (text
    before a ":" is used as a raw tag), tagged with the headers whose span
    overlaps the cell. Data cells containing a list are skipped.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        row_headers = []
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # shift the start column past headers of earlier rows that
            # still span into this row
            for header in chain(col_headers, row_headers):
                if (
                    row_index > header.row_index
                    and row_index < header.row_index + header.rowspan
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell_node in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell_node)
                # get_cell_span() falls back to 1 for a non-numeric
                # attribute instead of raising ValueError
                colspan, rowspan = get_cell_span(cell_node)
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    if row_has_data:
                        row_headers.append(
                            TableHeader(
                                cell_text,
                                col_index,
                                colspan,
                                row_index,
                                rowspan,
                            )
                        )
                    else:
                        if (
                            cell_text
                            == "Formas pessoais\n(formes personnelles)"
                        ):
                            col_headers.clear()
                            row_headers.clear()
                        col_headers.append(
                            TableHeader(
                                cell_text,
                                col_index,
                                colspan,
                                row_index,
                                rowspan,
                            )
                        )
                elif cell_node.contain_node(NodeKind.LIST):
                    continue  # skip end notes
                else:
                    for line in cell_text.splitlines():
                        form_str = line.strip("/ \n")
                        raw_tag = ""
                        if ":" in form_str:
                            colon_index = form_str.index(":")
                            raw_tag = form_str[:colon_index].strip()
                            form_str = form_str[colon_index + 1 :].strip()
                        if form_str not in ["", "-", wxr.wtp.title]:
                            form = Form(form=form_str, source=page_title)
                            for col_header in col_headers:
                                if (
                                    (
                                        (
                                            col_header.col_index
                                            < col_index + colspan
                                            and col_index
                                            < col_header.col_index
                                            + col_header.colspan
                                        )
                                        or (
                                            # "Modo Subjuntivo" header
                                            col_header.col_index == 0
                                            and col_header.row_index
                                            < row_index + rowspan
                                            and col_header.row_index
                                            + col_header.rowspan
                                            > row_index
                                        )
                                    )
                                    and col_header.text != ""
                                    and col_header.text not in form.raw_tags
                                ):
                                    form.raw_tags.append(col_header.text)
                            for row_header in row_headers:
                                if (
                                    row_header.row_index < row_index + rowspan
                                    and row_index
                                    < row_header.row_index + row_header.rowspan
                                    and row_header.text != ""
                                    and row_header.text not in form.raw_tags
                                ):
                                    form.raw_tags.append(row_header.text)
                            if raw_tag != "":
                                form.raw_tags.append(raw_tag)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += colspan
def extract_cs_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
):
    """
    Expand a "cs-conj-..." template and extract the forms of its tables.

    Header cells with `scope="col"` are column headers and those with
    `scope="row"` give the row header. Inside a data cell, `<br>` separates
    several forms; `<span>` elements with the class "ligne-de-forme" or
    "registre" are raw tags of the form they appear in, not part of its
    text.
    """

    def add_form(form_nodes, col_headers, col_index, row_header, raw_tags):
        # Store the form built from the collected nodes, then empty both
        # buffers. They are emptied even when the form is skipped, so that
        # a skipped segment cannot leak into the form after the next <br>.
        form_str = clean_node(wxr, None, form_nodes)
        if form_str not in ["", "—", wxr.wtp.title]:
            form = Form(form=form_str, source=page_title)
            if col_index < len(col_headers):
                form.raw_tags.append(col_headers[col_index])
            if row_header != "":
                form.raw_tags.append(row_header)
            form.raw_tags.extend(raw_tags)
            translate_raw_tags(form)
            word_entry.forms.append(form)
        form_nodes.clear()
        raw_tags.clear()

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            for col_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    cell_scope = cell.attrs.get("scope", "")
                    if cell_scope == "col":
                        col_headers.append(clean_node(wxr, None, cell))
                    elif cell_scope == "row":
                        row_header = clean_node(wxr, None, cell)
                    continue

                raw_tags = []
                form_nodes = []
                for node in cell.children:
                    if isinstance(node, HTMLNode) and node.tag == "span":
                        span_class = node.attrs.get("class", "").split()
                        if (
                            "ligne-de-forme" in span_class
                            or "registre" in span_class
                        ):
                            raw_tag = clean_node(wxr, None, node).strip("() ")
                            if raw_tag != "":
                                raw_tags.append(raw_tag)
                        else:
                            form_nodes.append(node)
                    elif isinstance(node, HTMLNode) and node.tag == "br":
                        add_form(
                            form_nodes,
                            col_headers,
                            col_index,
                            row_header,
                            raw_tags,
                        )
                    else:
                        form_nodes.append(node)
                # whatever follows the last <br> (or the whole cell)
                add_form(
                    form_nodes, col_headers, col_index, row_header, raw_tags
                )