Coverage for src/wiktextract/extractor/es/conjugation.py: 91%
81 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
from dataclasses import dataclass

from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry
from .tags import translate_raw_tags
def extract_conjugation_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Attach conjugation forms from ``es.v`` templates to verb entries.

    Every ``es.v`` template that is a direct child of ``level_node`` is
    processed with ``process_es_v_template``; the resulting forms and
    categories are accumulated and then appended to each entry in
    ``page_data`` that has the same ``lang_code`` and ``etymology_text``
    as the last entry and whose ``pos`` is ``"verb"``.

    Args:
        wxr: Extraction context, passed through to the template processor.
        page_data: Entries collected so far for the page; matching entries
            are mutated in place.
        level_node: Section node whose child templates are scanned.
    """
    forms = []
    cats = []
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "es.v":
            new_forms, new_cats = process_es_v_template(wxr, t_node)
            forms.extend(new_forms)
            cats.extend(new_cats)

    if not page_data:
        return

    # The last entry identifies the language/etymology the section
    # belongs to; look it up once instead of on every iteration.
    last_entry = page_data[-1]
    for data in page_data:
        if (
            data.lang_code == last_entry.lang_code
            and data.etymology_text == last_entry.etymology_text
            and data.pos == "verb"  # should be fixed on Wiktionary
        ):
            data.forms.extend(forms)
            data.categories.extend(cats)
@dataclass
class SpanHeader:
    """A table column header together with the columns it covers."""

    # Header text as stored by the table parser.
    text: str
    # Index of the first column covered by this header.
    index: int
    # Number of columns covered (taken from the cell's ``colspan``).
    span: int
# Maps the pronoun column headers used in the conjugation table to
# person/number tags.
# https://en.wikipedia.org/wiki/Spanish_pronouns
PRONOUN_TAGS = {
    "yo": ["first-person", "singular"],
    "que yo": ["first-person", "singular"],
    "tú": ["second-person", "singular"],
    "que tú": ["second-person", "singular"],
    "(tú)": ["second-person", "singular"],
    "vos": ["second-person", "singular", "vos-form"],
    "que vos": ["second-person", "singular", "vos-form"],
    "(vos)": ["second-person", "singular", "vos-form"],
    "él, ella, usted": ["third-person", "singular"],
    "que él, que ella, que usted": ["third-person", "singular"],
    "(usted)": ["third-person", "singular"],
    "nosotros": ["first-person", "plural"],
    "que nosotros": ["first-person", "plural"],
    "(nosotros)": ["first-person", "plural"],
    "vosotros": ["second-person", "plural"],
    "que vosotros": ["second-person", "plural"],
    "(vosotros)": ["second-person", "plural"],
    "ustedes, ellos": ["third-person", "plural"],
    "que ustedes, que ellos": ["third-person", "plural"],
    "(ustedes)": ["third-person", "plural"],
}
def process_es_v_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Form], list[str]]:
    """Extract conjugated forms from an expanded ``es.v`` template.

    The template is expanded and re-parsed, and the first table found in
    the result is walked row by row.  Rows made only of header cells
    define column headers (with their ``colspan``); in other rows a
    header cell is the row header and each link inside a data cell
    becomes a ``Form`` tagged with the matching column headers and the
    row header.

    https://es.wiktionary.org/wiki/Plantilla:es.v

    Args:
        wxr: Extraction context used for parsing and node cleaning.
        template_node: The ``es.v`` template node to process.

    Returns:
        A tuple ``(forms, categories)``.  ``categories`` is whatever
        ``clean_node`` stored under the ``"categories"`` key while
        cleaning the expanded template.  Both are empty lists when the
        expansion contains no table.
    """
    forms = []
    cats = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, cats, expanded_node)

    # Only the first table is used, so stop searching once it is found.
    table_node = next(
        iter(expanded_node.find_child_recursively(NodeKind.TABLE)), None
    )
    if table_node is None:
        return [], []

    col_headers = []
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        single_cell = len(list(row.filter_empty_str_child())) == 1
        all_header_row = row.contain_node(
            NodeKind.TABLE_HEADER_CELL
        ) and not row.contain_node(NodeKind.TABLE_CELL)
        if single_cell:
            if not all_header_row:
                continue  # ignore end notes
            col_headers.clear()  # new table

        col_index = 0
        is_archaic_row = False
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell_text == "":
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if all_header_row:
                    # Column header: remember which columns it spans.
                    colspan = int(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        SpanHeader(
                            cell_text.removeprefix("Modo ").strip(),
                            col_index,
                            colspan,
                        )
                    )
                    col_index += colspan
                else:
                    # Row header: a trailing "^†" marks the row as archaic.
                    is_archaic_row = cell_text.endswith("^†")
                    row_header = cell_text.removesuffix("^†").strip()
                continue

            # Data cell: nodes are buffered until a link is reached; each
            # link closes one form.
            cell_nodes = []
            for node in cell.children:
                if isinstance(node, HTMLNode) and node.tag == "sup":
                    # A <sup> annotates the most recently added form.
                    sup_tag = clean_node(wxr, None, node.children)
                    if sup_tag and forms:
                        forms[-1].raw_tags.append(sup_tag)
                        translate_raw_tags(forms[-1])
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    cell_nodes.append(node)
                    form = Form(
                        form=clean_node(wxr, None, cell_nodes).lstrip(", ")
                    )
                    for col_head in col_headers:
                        if (
                            col_head.index
                            <= col_index
                            < col_head.index + col_head.span
                        ):
                            form.raw_tags.append(col_head.text)
                            form.tags.extend(
                                PRONOUN_TAGS.get(col_head.text, [])
                            )
                    if row_header:
                        form.raw_tags.append(row_header)
                    if is_archaic_row:
                        form.tags.append("archaic")
                    if form.form not in ("", "―"):
                        translate_raw_tags(form)
                        forms.append(form)
                    cell_nodes.clear()
                elif not (
                    isinstance(node, HTMLNode)
                    and "movil" in node.attrs.get("class", "")
                ):
                    # HTML nodes whose class contains "movil" are left out
                    # of the form text (original note: hidden HTML tag).
                    cell_nodes.append(node)
            col_index += 1

    return forms, cats.get("categories", [])