Coverage for src/wiktextract/extractor/es/conjugation.py: 93%
88 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1from dataclasses import dataclass
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .tags import translate_raw_tags
def extract_conjugation_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Attach conjugation forms found in a section to matching verb entries.

    Templates returned by ``level_node.find_child(NodeKind.TEMPLATE)`` whose
    name is ``es.v``, ``en.v``, ``de.v`` or ``pl.v`` are handed to
    ``process_es_v_template``. The forms and categories gathered from all of
    them are appended to every entry of ``page_data`` that has the same
    language code and etymology text as the last entry and whose ``pos`` is
    ``"verb"``.
    """
    conjugation_templates = ("es.v", "en.v", "de.v", "pl.v")
    section_forms = []
    section_categories = []
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in conjugation_templates:
            continue
        template_forms, template_categories = process_es_v_template(
            wxr, template
        )
        section_forms += template_forms
        section_categories += template_categories

    for entry in page_data:
        # Looked up inside the loop so that an empty page_data never indexes.
        latest_entry = page_data[-1]
        belongs_to_section = (
            entry.lang_code == latest_entry.lang_code
            and entry.etymology_text == latest_entry.etymology_text
            and entry.pos == "verb"  # should be fixed on Wiktionary
        )
        if belongs_to_section:
            entry.forms.extend(section_forms)
            entry.categories.extend(section_categories)
@dataclass
class SpanHeader:
    """A column header of a conjugation table and the columns it covers."""

    # Cleaned header text; it is also used as a raw tag and as the lookup key
    # into PRONOUN_TAGS.
    text: str
    # Index of the first column covered by this header.
    index: int
    # Number of consecutive columns covered, starting at ``index``.
    span: int
# Maps the pronoun text shown in a conjugation table header to grammatical
# tags. Keys must match the cleaned header text exactly; parenthesized keys
# are the pronoun headers written in parentheses in the tables.
# https://en.wikipedia.org/wiki/Spanish_pronouns
PRONOUN_TAGS = {
    "yo": ["first-person", "singular"],
    "que yo": ["first-person", "singular"],
    "tú": ["second-person", "singular"],
    "que tú": ["second-person", "singular"],
    "(tú)": ["second-person", "singular"],
    "vos": ["second-person", "singular", "vos-form"],
    "que vos": ["second-person", "singular", "vos-form"],
    "(vos)": ["second-person", "singular", "vos-form"],
    "él, ella, usted": ["third-person", "singular"],
    "que él, que ella, que usted": ["third-person", "singular"],
    "(usted)": ["third-person", "singular"],
    "nosotros": ["first-person", "plural"],
    "que nosotros": ["first-person", "plural"],
    "(nosotros)": ["first-person", "plural"],
    "vosotros": ["second-person", "plural"],
    "que vosotros": ["second-person", "plural"],
    "(vosotros)": ["second-person", "plural"],
    "ustedes, ellos": ["third-person", "plural"],
    "que ustedes, que ellos": ["third-person", "plural"],
    "(ustedes)": ["third-person", "plural"],
    # Template:en.v
    "I": ["first-person", "singular"],
    # English "you" is both singular and plural, so no number tag is given.
    "you": ["second-person"],
    "(you)": ["second-person"],
    "he, she, it": ["third-person", "singular"],
    # One shared column for every plural person.
    "we, you, they": [
        "first-person",
        "second-person",
        "third-person",
        "plural",
    ],
    "(we)": ["first-person", "plural"],
    # Template:de.v
    "ich": ["first-person", "singular"],
    "du": ["second-person", "singular"],
    "er, sie, es": ["third-person", "singular"],
    "wir": ["first-person", "plural"],
    "ihr": ["second-person", "plural"],
    "sie": ["third-person", "plural"],
    "(du)": ["second-person", "singular"],
    "(wir)": ["first-person", "plural"],
    "(ihr)": ["second-person", "plural"],
    "(Sie)": ["third-person", "plural"],
    # Template:pl.v
    "ja": ["first-person", "singular"],
    "ty": ["second-person", "singular"],
    "on, ona, ono": ["third-person", "singular"],
    "my": ["first-person", "plural"],
    "wy": ["second-person", "plural"],
    "oni, one": ["third-person", "plural"],
    "(ty)": ["second-person", "singular"],
    "(my)": ["first-person", "plural"],
    "(wy)": ["second-person", "plural"],
}
def _parse_colspan(value: str) -> int:
    """Return the column span encoded in an HTML ``colspan`` attribute value.

    Falls back to ``1`` when the value is not an integer or is smaller than
    one, so a malformed attribute cannot abort the extraction or produce a
    header that covers no column.
    """
    try:
        return max(int(value), 1)
    except ValueError:
        return 1


def process_es_v_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Form], list[str]]:
    """Expand a conjugation template and extract forms from its first table.

    Args:
        wxr: Extraction context; its ``wtp`` is used to expand and parse the
            template.
        template_node: The conjugation template to process.

    Returns:
        A ``(forms, categories)`` tuple. ``forms`` holds the forms read from
        the data cells of the first table in the expanded template;
        ``categories`` is the ``"categories"`` list filled in by
        ``clean_node`` for the expanded template. Both are empty when the
        expansion contains no table.
    """
    # https://es.wiktionary.org/wiki/Plantilla:es.v
    forms = []
    cats = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, cats, expanded_node)
    # Only the first table of the expansion is read.
    table_node = next(
        iter(expanded_node.find_child_recursively(NodeKind.TABLE)), None
    )
    if table_node is None:
        return [], []

    col_headers = []
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        single_cell = len(list(row.filter_empty_str_child())) == 1
        all_header_row = row.contain_node(
            NodeKind.TABLE_HEADER_CELL
        ) and not row.contain_node(NodeKind.TABLE_CELL)
        if not all_header_row and single_cell:
            continue  # ignore end notes
        if all_header_row and single_cell:
            col_headers.clear()  # new table

        # col_index counts column headers in header rows and data cells in
        # data rows; row header cells and empty cells do not advance it.
        col_index = 0
        is_archaic_row = False
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell_text == "":
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if all_header_row:
                    colspan = _parse_colspan(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        SpanHeader(
                            cell_text.removeprefix("Modo ").strip(),
                            col_index,
                            colspan,
                        )
                    )
                    col_index += colspan
                else:
                    # A row header ending in "^†" marks the row as archaic.
                    is_archaic_row = cell_text.endswith("^†")
                    row_header = cell_text.removesuffix("^†").strip()
            else:
                cell_nodes = []
                for node in cell.children:
                    if isinstance(node, HTMLNode) and node.tag == "sup":
                        # NOTE(review): the superscript is attached to the
                        # last form collected so far, which can come from an
                        # earlier cell when this cell has produced no form
                        # yet - confirm this is intended.
                        sup_tag = clean_node(wxr, None, node.children)
                        if sup_tag and forms:
                            forms[-1].raw_tags.append(sup_tag)
                            translate_raw_tags(forms[-1])
                    elif (
                        isinstance(node, WikiNode)
                        and node.kind == NodeKind.LINK
                    ):
                        # A link closes the pending run of nodes: emit its
                        # forms now so a following <sup> applies to them.
                        cell_nodes.append(node)
                        forms.extend(
                            process_es_v_cell(
                                wxr,
                                cell_nodes,
                                col_index,
                                col_headers,
                                row_header,
                                is_archaic_row,
                            )
                        )
                        cell_nodes.clear()
                    elif not (
                        isinstance(node, HTMLNode)
                        and "movil" in node.attrs.get("class", "")
                    ):
                        # Keep everything except HTML nodes whose class
                        # contains "movil" (hidden HTML tag).
                        cell_nodes.append(node)
                if cell_nodes:
                    forms.extend(
                        process_es_v_cell(
                            wxr,
                            cell_nodes,
                            col_index,
                            col_headers,
                            row_header,
                            is_archaic_row,
                        )
                    )
                col_index += 1
    return forms, cats.get("categories", [])
def process_es_v_cell(
    wxr: WiktextractContext,
    cell_nodes: list[WikiNode | str],
    col_index: int,
    col_headers: list[SpanHeader],
    row_header: str,
    is_archaic: bool,
) -> list[Form]:
    """Build the forms contained in one run of table cell nodes.

    The cleaned text of ``cell_nodes`` is split on commas. Pieces that are
    empty, equal to "―" or equal to the page title are dropped. Every kept
    form receives the text of each column header whose span contains
    ``col_index`` as a raw tag (plus the matching ``PRONOUN_TAGS`` entry as
    tags), the row header as a raw tag when it is not empty, and the
    "archaic" tag when ``is_archaic`` is true.
    """
    ignored_forms = ["", "―", wxr.wtp.title]
    cell_text = clean_node(wxr, None, cell_nodes)
    results = []
    for piece in cell_text.split(","):
        new_form = Form(form=piece.strip())
        if new_form.form in ignored_forms:
            continue
        for header in col_headers:
            if header.index <= col_index < header.index + header.span:
                new_form.raw_tags.append(header.text)
                new_form.tags.extend(PRONOUN_TAGS.get(header.text, []))
        if row_header != "":
            new_form.raw_tags.append(row_header)
        if is_archaic:
            new_form.tags.append("archaic")
        translate_raw_tags(new_form)
        results.append(new_form)
    return results