Coverage for src/wiktextract/extractor/es/conjugation.py: 93%
88 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from dataclasses import dataclass
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .tags import translate_raw_tags
def extract_conjugation_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Attach conjugation forms found under *level_node* to verb entries.

    Every ``es.v`` / ``en.v`` template that is a direct child of
    *level_node* is handed to ``process_es_v_template``; the forms and
    categories it returns are then appended to each entry in *page_data*
    that has the same language code and etymology text as the last entry
    and whose part of speech is ``"verb"``.  Entries are modified in place.
    """
    collected_forms = []
    collected_cats = []
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in ["es.v", "en.v"]:
            continue
        template_forms, template_cats = process_es_v_template(wxr, template)
        collected_forms += template_forms
        collected_cats += template_cats

    if not page_data:
        return  # nothing to attach the forms to

    latest = page_data[-1]
    for entry in page_data:
        same_word = (
            entry.lang_code == latest.lang_code
            and entry.etymology_text == latest.etymology_text
        )
        # Only verb entries receive the forms; the original author notes
        # that the POS restriction "should be fixed on Wiktionary".
        if same_word and entry.pos == "verb":
            entry.forms.extend(collected_forms)
            entry.categories.extend(collected_cats)
@dataclass
class SpanHeader:
    """A column header of a conjugation table and the columns it covers.

    A header applies to every column ``c`` with
    ``index <= c < index + span``.
    """

    # Cleaned header label, used as a raw tag on matching forms.
    text: str
    # Zero-based index of the first column the header covers.
    index: int
    # Number of consecutive columns covered (taken from ``colspan``).
    span: int
# Maps the pronoun column headers of the conjugation tables to
# grammatical person/number tags.
# https://en.wikipedia.org/wiki/Spanish_pronouns
PRONOUN_TAGS = {
    "yo": ["first-person", "singular"],
    "que yo": ["first-person", "singular"],
    "tú": ["second-person", "singular"],
    "que tú": ["second-person", "singular"],
    "(tú)": ["second-person", "singular"],
    "vos": ["second-person", "singular", "vos-form"],
    "que vos": ["second-person", "singular", "vos-form"],
    "(vos)": ["second-person", "singular", "vos-form"],
    "él, ella, usted": ["third-person", "singular"],
    "que él, que ella, que usted": ["third-person", "singular"],
    "(usted)": ["third-person", "singular"],
    "nosotros": ["first-person", "plural"],
    "que nosotros": ["first-person", "plural"],
    "(nosotros)": ["first-person", "plural"],
    "vosotros": ["second-person", "plural"],
    "que vosotros": ["second-person", "plural"],
    "(vosotros)": ["second-person", "plural"],
    "ustedes, ellos": ["third-person", "plural"],
    "que ustedes, que ellos": ["third-person", "plural"],
    "(ustedes)": ["third-person", "plural"],
    # Template:en.v
    "I": ["first-person"],
    "you": ["second-person"],
    "(you)": ["second-person"],
    "he, she, it": ["third-person", "singular"],
    # This header covers first, second and third person, so only the
    # number is unambiguous.
    "we, you, they": ["plural"],
    # "we" is first-person plural, not third-person.
    "(we)": ["first-person", "plural"],
}
def _parse_colspan(cell: WikiNode) -> int:
    """Return the ``colspan`` attribute of *cell* as an int, defaulting to 1."""
    try:
        return int(cell.attrs.get("colspan", "1"))
    except ValueError:
        # A malformed attribute value must not abort extraction of the
        # whole table; treat the header as covering a single column.
        return 1


def process_es_v_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Form], list[str]]:
    """Extract inflected forms from an expanded conjugation template.

    The template is expanded and parsed, and the first table found in the
    result is walked row by row:

    * a row made only of header cells defines the column headers (a
      single-cell header row starts a new sub-table and resets them);
    * a header cell in a mixed row is the row header; a trailing ``^†``
      marks the row as archaic;
    * every data cell is converted to forms by ``process_es_v_cell``.

    See https://es.wiktionary.org/wiki/Plantilla:es.v

    Returns a ``(forms, categories)`` tuple; both lists are empty when the
    expansion contains no table.
    """
    forms = []
    cats = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    # Called for its effect on ``cats``; the "categories" key is read back
    # at the end of the function.
    clean_node(wxr, cats, expanded_node)
    table_nodes = list(expanded_node.find_child_recursively(NodeKind.TABLE))
    if not table_nodes:
        return [], []
    table_node = table_nodes[0]
    col_headers = []
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        single_cell = len(list(row.filter_empty_str_child())) == 1
        all_header_row = row.contain_node(
            NodeKind.TABLE_HEADER_CELL
        ) and not row.contain_node(NodeKind.TABLE_CELL)
        if not all_header_row and single_cell:
            continue  # ignore end notes
        if all_header_row and single_cell:
            col_headers.clear()  # new table

        col_index = 0
        is_archaic_row = False
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell_text == "":
                # Empty cells are skipped without advancing the column.
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if all_header_row:
                    colspan = _parse_colspan(cell)
                    col_headers.append(
                        SpanHeader(
                            cell_text.removeprefix("Modo ").strip(),
                            col_index,
                            colspan,
                        )
                    )
                    col_index += colspan
                else:
                    # Row header; it does not occupy a data column.
                    is_archaic_row = cell_text.endswith("^†")
                    row_header = cell_text.removesuffix("^†").strip()
            else:
                cell_nodes = []
                for node in cell.children:
                    if isinstance(node, HTMLNode) and node.tag == "sup":
                        # A superscript annotates the most recently
                        # extracted form.
                        sup_tag = clean_node(wxr, None, node.children)
                        if sup_tag != "" and forms:
                            forms[-1].raw_tags.append(sup_tag)
                            translate_raw_tags(forms[-1])
                    elif (
                        isinstance(node, WikiNode)
                        and node.kind == NodeKind.LINK
                    ):
                        # A link ends one group of forms: emit what has
                        # been gathered so far together with the link.
                        cell_nodes.append(node)
                        forms.extend(
                            process_es_v_cell(
                                wxr,
                                cell_nodes,
                                col_index,
                                col_headers,
                                row_header,
                                is_archaic_row,
                            )
                        )
                        cell_nodes.clear()
                    elif not (
                        isinstance(node, HTMLNode)
                        and "movil" in node.attrs.get("class", "")
                    ):
                        # Keep everything except HTML nodes whose class
                        # contains "movil" (hidden HTML tag).
                        cell_nodes.append(node)
                if cell_nodes:
                    forms.extend(
                        process_es_v_cell(
                            wxr,
                            cell_nodes,
                            col_index,
                            col_headers,
                            row_header,
                            is_archaic_row,
                        )
                    )
                col_index += 1
    return forms, cats.get("categories", [])
def process_es_v_cell(
    wxr: WiktextractContext,
    cell_nodes: list[WikiNode | str],
    col_index: int,
    col_headers: list[SpanHeader],
    row_header: str,
    is_archaic: bool,
) -> list[Form]:
    """Turn the nodes of one table cell into ``Form`` objects.

    The cleaned cell text is split on commas, one form per piece.  Each
    form gets the text of every column header covering *col_index* as a
    raw tag (plus the matching ``PRONOUN_TAGS`` entry as tags), then the
    row header if it is non-empty, and the ``"archaic"`` tag when
    *is_archaic* is set.  Empty strings, the ``―`` placeholder and the
    page title itself are not returned.
    """
    cell_text = clean_node(wxr, None, cell_nodes)
    skipped_values = ["", "―", wxr.wtp.title]
    # The column position is the same for every form in the cell, so the
    # applicable headers are resolved once up front.
    header_texts = [
        header.text
        for header in col_headers
        if header.index <= col_index < header.index + header.span
    ]

    result = []
    for piece in cell_text.split(","):
        entry = Form(form=piece.strip())
        if entry.form in skipped_values:
            continue
        for header_text in header_texts:
            entry.raw_tags.append(header_text)
            entry.tags.extend(PRONOUN_TAGS.get(header_text, []))
        if row_header != "":
            entry.raw_tags.append(row_header)
        if is_archaic:
            entry.tags.append("archaic")
        translate_raw_tags(entry)
        result.append(entry)
    return result