Coverage for src / wiktextract / extractor / es / conjugation.py: 91%
99 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-26 08:59 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-26 08:59 +0000
1from dataclasses import dataclass
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .tags import translate_raw_tags
def extract_conjugation_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Collect conjugation forms from a section and add them to verb entries.

    Every direct child template of ``level_node`` named ``es.v``, ``en.v``,
    ``de.v`` or ``pl.v`` is handed to ``process_es_v_template``.  The forms
    and categories gathered that way are appended to each entry of
    ``page_data`` that has the same ``lang_code`` and ``etymology_texts`` as
    the last entry and whose ``pos`` is ``"verb"``.

    Args:
        wxr: Extraction context passed through to the template processor.
        page_data: Entries collected so far; matching entries are mutated
            in place.
        level_node: Section node whose child templates are inspected.
    """
    conjugation_templates = ("es.v", "en.v", "de.v", "pl.v")
    collected_forms = []
    collected_categories = []
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in conjugation_templates:
            continue
        template_forms, template_categories = process_es_v_template(
            wxr, template
        )
        collected_forms += template_forms
        collected_categories += template_categories

    for entry in page_data:
        latest = page_data[-1]
        if entry.lang_code != latest.lang_code:
            continue
        if entry.etymology_texts != latest.etymology_texts:
            continue
        # should be fixed on Wiktionary
        if entry.pos != "verb":
            continue
        entry.forms.extend(collected_forms)
        entry.categories.extend(collected_categories)
@dataclass
class SpanHeader:
    """A table header label together with the columns it covers.

    Attributes:
        text: Label of the header.
        index: Index of the first column the header applies to.
        span: Number of consecutive columns the header applies to.
    """

    text: str
    index: int
    span: int
# Maps a conjugation-table pronoun header to the grammatical tags added to
# every form found under that header.
# https://en.wikipedia.org/wiki/Spanish_pronouns
PRONOUN_TAGS = {
    "yo": ["first-person", "singular"],
    "que yo": ["first-person", "singular"],
    "tú": ["second-person", "singular"],
    "que tú": ["second-person", "singular"],
    "(tú)": ["second-person", "singular"],
    "vos": ["second-person", "singular", "vos-form"],
    "que vos": ["second-person", "singular", "vos-form"],
    "(vos)": ["second-person", "singular", "vos-form"],
    "él, ella, usted": ["third-person", "singular"],
    "que él, que ella, que usted": ["third-person", "singular"],
    "(usted)": ["third-person", "singular"],
    "nosotros": ["first-person", "plural"],
    "que nosotros": ["first-person", "plural"],
    "(nosotros)": ["first-person", "plural"],
    "vosotros": ["second-person", "plural"],
    "que vosotros": ["second-person", "plural"],
    "(vosotros)": ["second-person", "plural"],
    "ustedes, ellos": ["third-person", "plural"],
    "que ustedes, que ellos": ["third-person", "plural"],
    "(ustedes)": ["third-person", "plural"],
    # Template:en.v
    "I": ["first-person"],
    "you": ["second-person"],
    "(you)": ["second-person"],
    "he, she, it": ["third-person", "singular"],
    # This header covers first, second and third person, so only the
    # number is a safe tag.
    "we, you, they": ["plural"],
    # "we" is first person, matching "(nosotros)", "(wir)" and "(my)".
    "(we)": ["first-person", "plural"],
    # Template:de.v
    "ich": ["first-person", "singular"],
    "du": ["second-person", "singular"],
    "er, sie, es": ["third-person", "singular"],
    "wir": ["first-person", "plural"],
    "ihr": ["second-person", "plural"],
    "sie": ["third-person", "plural"],
    "(du)": ["second-person", "singular"],
    "(wir)": ["first-person", "plural"],
    "(ihr)": ["second-person", "plural"],
    "(Sie)": ["third-person", "plural"],
    # Template:pl.v
    "ja": ["first-person", "singular"],
    "ty": ["second-person", "singular"],
    "on, ona, ono": ["third-person", "singular"],
    "my": ["first-person", "plural"],
    "wy": ["second-person", "plural"],
    "oni, one": ["third-person", "plural"],
    "(ty)": ["second-person", "singular"],
    "(my)": ["first-person", "plural"],
    "(wy)": ["second-person", "plural"],
}
def _flush_es_v_cell(
    wxr: WiktextractContext,
    forms: list[Form],
    cell_nodes: list[WikiNode | str],
    found_sup: list[str],
    col_index: int,
    col_headers: list[SpanHeader],
    row_header: str,
    is_archaic_row: bool,
) -> None:
    """Turn buffered cell nodes into forms and attach pending ``<sup>`` marks.

    The marks in ``found_sup`` are added as raw tags to the last form produced
    from ``cell_nodes`` only.  If the buffered nodes yield no form, the marks
    are dropped instead of being attached to a form from an earlier cell.
    Both buffers are emptied afterwards and new forms are appended to
    ``forms``.
    """
    new_forms = process_es_v_cell(
        wxr, cell_nodes, col_index, col_headers, row_header, is_archaic_row
    )
    if found_sup and new_forms:
        new_forms[-1].raw_tags.extend(found_sup)
        translate_raw_tags(new_forms[-1])
    forms.extend(new_forms)
    found_sup.clear()
    cell_nodes.clear()


def process_es_v_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Form], list[str]]:
    """Extract forms and categories from a conjugation table template.

    The template is expanded and re-parsed, and only the first table found in
    the expansion is read.  Rows made up solely of header cells define the
    column headers; in other rows a header cell is the row header and each
    non-empty data cell is converted to forms by ``process_es_v_cell``.

    Args:
        wxr: Extraction context used for parsing and node cleaning.
        template_node: The conjugation template node to process.

    Returns:
        A tuple ``(forms, categories)``.  ``categories`` is the
        ``"categories"`` entry that ``clean_node`` stored in the dict passed
        to it.  ``([], [])`` is returned when the expansion has no table.
    """
    # https://es.wiktionary.org/wiki/Plantilla:es.v
    forms: list[Form] = []
    cats = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, cats, expanded_node)
    # Only the first table is needed, so stop searching once it is found.
    table_node = next(
        iter(expanded_node.find_child_recursively(NodeKind.TABLE)), None
    )
    if table_node is None:
        return [], []

    col_headers: list[SpanHeader] = []
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        single_cell = len(list(row.filter_empty_str_child())) == 1
        all_header_row = row.contain_node(
            NodeKind.TABLE_HEADER_CELL
        ) and not row.contain_node(NodeKind.TABLE_CELL)
        if single_cell:
            if not all_header_row:
                continue  # ignore end notes
            col_headers.clear()  # new table

        col_index = 0
        is_archaic_row = False
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell_text == "":
                # Empty cells are skipped without advancing the column index.
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if all_header_row:
                    colspan = int(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        SpanHeader(
                            cell_text.removeprefix("Modo ").strip(),
                            col_index,
                            colspan,
                        )
                    )
                    col_index += colspan
                else:
                    # A trailing "^†" on the row header marks the row as
                    # archaic and is not part of the header text.
                    is_archaic_row = cell_text.endswith("^†")
                    row_header = cell_text.removesuffix("^†").strip()
                continue

            # Data cell: buffer its nodes and flush the buffer whenever
            # <sup> marks have been collected, so each mark ends up on the
            # form that precedes it.
            cell_nodes: list[WikiNode | str] = []
            found_sup: list[str] = []
            for node in cell.children:
                if isinstance(node, HTMLNode) and node.tag == "sup":
                    sup_tag = clean_node(wxr, None, node.children)
                    if sup_tag != "":
                        found_sup.append(sup_tag)
                    continue
                if isinstance(node, HTMLNode) and "movil" in node.attrs.get(
                    "class", ""
                ):
                    # hidden HTML tag (presumably a mobile-only duplicate of
                    # the cell content; verify against the template)
                    continue
                if found_sup:
                    _flush_es_v_cell(
                        wxr,
                        forms,
                        cell_nodes,
                        found_sup,
                        col_index,
                        col_headers,
                        row_header,
                        is_archaic_row,
                    )
                cell_nodes.append(node)
            if cell_nodes:
                _flush_es_v_cell(
                    wxr,
                    forms,
                    cell_nodes,
                    found_sup,
                    col_index,
                    col_headers,
                    row_header,
                    is_archaic_row,
                )
            col_index += 1
    return forms, cats.get("categories", [])
def process_es_v_cell(
    wxr: WiktextractContext,
    cell_nodes: list[WikiNode | str],
    col_index: int,
    col_headers: list[SpanHeader],
    row_header: str,
    is_archaic: bool,
) -> list[Form]:
    """Build ``Form`` objects from the content of one conjugation table cell.

    The cleaned cell text is split on commas and every non-empty piece
    becomes one form.  Each form receives the text of every column header
    whose span covers ``col_index`` as a raw tag, plus the matching
    ``PRONOUN_TAGS`` entry as tags, then the row header as a raw tag and the
    ``"archaic"`` tag when ``is_archaic`` is set.  Forms equal to ``"―"`` or
    to ``wxr.wtp.title`` are left out of the result.

    Args:
        wxr: Extraction context used for node cleaning.
        cell_nodes: Nodes making up the cell content.
        col_index: Column index of the cell within its row.
        col_headers: Column headers collected so far.
        row_header: Header text of the row, or ``""`` when there is none.
        is_archaic: Whether the row was marked as archaic.

    Returns:
        The forms found in the cell, in order of appearance.
    """
    cell_text = clean_node(wxr, None, cell_nodes, no_strip=True)
    # The headers covering this column are the same for every form in it.
    covering_headers = [
        header.text
        for header in col_headers
        if header.index <= col_index < header.index + header.span
    ]

    result = []
    for piece in cell_text.split(","):
        candidate = piece.strip()
        if candidate == "":
            continue
        entry = Form(form=candidate)
        for header_text in covering_headers:
            entry.raw_tags.append(header_text)
            entry.tags.extend(PRONOUN_TAGS.get(header_text, []))
        if row_header:
            entry.raw_tags.append(row_header)
        if is_archaic:
            entry.tags.append("archaic")
        if entry.form in ("", "―", wxr.wtp.title):
            continue
        translate_raw_tags(entry)
        result.append(entry)
    return result