Coverage for src/wiktextract/extractor/it/inflection.py: 91%
117 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
def extract_tabs_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Append the gender/number forms listed in a "Tabs" template.

    Template arguments 1 to 4 are read in order and tagged, respectively,
    masculine singular, masculine plural, feminine singular and feminine
    plural.  An argument whose cleaned text is empty or equals the current
    page title is skipped.

    Template page: https://it.wiktionary.org/wiki/Template:Tabs
    """
    tags_by_position = [
        ["masculine", "singular"],
        ["masculine", "plural"],
        ["feminine", "singular"],
        ["feminine", "plural"],
    ]
    # Template arguments are numbered from 1, hence ``start=1``.
    for position, form_tags in enumerate(tags_by_position, start=1):
        raw_argument = node.template_parameters.get(position, "")
        form_text = clean_node(wxr, None, raw_argument)
        if form_text in ("", wxr.wtp.title):
            continue
        word_entry.forms.append(Form(form=form_text, tags=form_tags))
def extract_it_decl_agg_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from the table produced by a declension template.

    The template is expanded and each table yielded by ``find_child`` is
    walked row by row.  Header cells feed three kinds of raw tags:

    * a header cell with a non-empty ``colspan`` attribute sets a tag that
      stays in effect for the rest of the table;
    * a header cell that is the only header cell of its row sets the tag of
      that row;
    * any other header cell with non-empty text is appended to the list of
      column tags, which is kept for the whole table.

    Each data cell whose text is neither empty nor the page title becomes a
    ``Form`` carrying whichever of those tags are available, after which
    ``translate_raw_tags`` is applied.

    Template pages:
    https://it.wiktionary.org/wiki/Template:It-decl-agg4
    https://it.wiktionary.org/wiki/Template:It-decl-agg2
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table in expanded.find_child(NodeKind.TABLE):
        table_tag = ""
        column_tags = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_tag = ""
            column = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if cell.attrs.get("colspan", "") != "":
                        table_tag = clean_node(wxr, None, cell)
                        continue
                    headers_in_row = sum(
                        1 for _ in row.find_child(NodeKind.TABLE_HEADER_CELL)
                    )
                    if headers_in_row == 1:
                        row_tag = clean_node(wxr, None, cell)
                        continue
                    header_text = clean_node(wxr, None, cell)
                    if header_text != "":
                        column_tags.append(header_text)
                elif cell.kind == NodeKind.TABLE_CELL:
                    word = clean_node(wxr, None, cell)
                    if word not in ("", wxr.wtp.title):
                        form = Form(form=word)
                        form.raw_tags.extend(
                            tag for tag in (table_tag, row_tag) if tag != ""
                        )
                        if column < len(column_tags):
                            form.raw_tags.append(column_tags[column])
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    # Skipped cells still occupy a column.
                    # NOTE(review): this increment is assumed to run for
                    # every data cell, not only for kept ones - confirm.
                    column += 1
def extract_appendix_conjugation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract conjugation forms from a conjugation appendix page.

    The page body is fetched with ``get_page_body(page_title, 100)``; when
    that returns ``None`` nothing is extracted.  Otherwise the body is
    parsed and every template yielded by ``find_child`` whose lower-cased
    name is "it-conj" is handed to ``extract_it_conj_template``.

    Appendix index: https://it.wiktionary.org/wiki/Appendice:Coniugazioni
    """
    # NOTE(review): 100 is presumably the namespace id of the appendix
    # namespace - confirm against the wiki configuration.
    body = wxr.wtp.get_page_body(page_title, 100)
    if body is None:
        return
    for template in wxr.wtp.parse(body).find_child(NodeKind.TEMPLATE):
        if template.template_name.lower() != "it-conj":
            continue
        extract_it_conj_template(wxr, word_entry, template, page_title)
@dataclass
class TableHeader:
    """A header cell recorded while walking a conjugation table."""

    # Cleaned text of the header cell.
    text: str
    # Column at which the header starts.
    col_index: int
    # Number of columns the header covers.
    colspan: int
    # Row position and row span; the one construction site in this module
    # passes 0 for both.
    row_index: int
    rowspan: int
def extract_it_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """Collect verb forms from an expanded "It-conj" template.

    Each table of the expanded template is walked row by row while two
    pieces of header state are tracked: the current row header and a list
    of ``TableHeader`` column headers.

    Header cells:

    * "persona" and "indicativo" are ignored;
    * "condizionale" and "congiuntivo" clear the column headers;
    * "imperativo" clears the column headers and becomes the row header;
    * any other header in a row that also contains data cells becomes the
      row header;
    * any other header in a row without data cells is recorded as a column
      header spanning ``colspan`` columns (1 unless the attribute is all
      digits).

    Data cells containing nested tables are delegated to
    ``extract_it_conj_cell_table``; otherwise every line of the cell text
    is passed to ``add_it_conj_form`` unless it is empty, equals the page
    title, or starts with "verbo di ".

    Template page: https://it.wiktionary.org/wiki/Template:It-conj
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table in expanded.find_child(NodeKind.TABLE):
        column_headers = []
        current_row_header = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            column = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text = clean_node(wxr, None, cell)
                    if header_text in ("persona", "indicativo"):
                        continue
                    if header_text in ("condizionale", "congiuntivo"):
                        column_headers.clear()
                        continue
                    if header_text == "imperativo":
                        column_headers.clear()
                        current_row_header = "imperativo"
                        continue

                    if row.contain_node(NodeKind.TABLE_CELL):
                        # A header sharing its row with data cells labels
                        # that row.
                        current_row_header = header_text
                        continue
                    span_text = cell.attrs.get("colspan", "1")
                    span = (
                        int(span_text)
                        if re.fullmatch(r"\d+", span_text)
                        else 1
                    )
                    column_headers.append(
                        TableHeader(header_text, column, span, 0, 0)
                    )
                    column += span
                elif cell.kind == NodeKind.TABLE_CELL:
                    found_nested_table = False
                    for nested_table in cell.find_child_recursively(
                        NodeKind.TABLE
                    ):
                        extract_it_conj_cell_table(
                            wxr,
                            word_entry,
                            nested_table,
                            current_row_header,
                            column_headers,
                            page_title,
                        )
                        found_nested_table = True
                    if not found_nested_table:
                        cell_text = clean_node(wxr, None, cell)
                        for line in cell_text.splitlines():
                            candidate = line.strip(", ")
                            if candidate.startswith("verbo di "):
                                continue  # first row
                            if candidate in ("", wxr.wtp.title):
                                continue
                            add_it_conj_form(
                                word_entry,
                                candidate,
                                page_title,
                                current_row_header,
                                column,
                                column_headers,
                            )
                    # NOTE(review): this increment is assumed to run for
                    # every data cell, including cells that hold a nested
                    # table - confirm against the original indentation.
                    column += 1
def extract_it_conj_cell_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table_node: WikiNode,
    row_header: str,
    col_headers: list[TableHeader],
    page_title: str,
) -> None:
    """Add the forms found in a table nested inside a conjugation cell.

    Every data cell of every row is cleaned and split into lines.  Each
    line that is neither empty nor equal to the page title is passed to
    ``add_it_conj_form`` together with the position of the cell among the
    data cells of its row, which selects the matching column headers.
    """
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        data_cells = row.find_child(NodeKind.TABLE_CELL)
        for position, cell in enumerate(data_cells):
            cell_text = clean_node(wxr, None, cell)
            for line in cell_text.splitlines():
                if line in ("", wxr.wtp.title):
                    continue
                add_it_conj_form(
                    word_entry=word_entry,
                    form_str=line,
                    page_title=page_title,
                    row_header=row_header,
                    col_index=position,
                    col_headers=col_headers,
                )
def add_it_conj_form(
    word_entry: WordEntry,
    form_str: str,
    page_title: str,
    row_header: str,
    col_index: int,
    col_headers: list[TableHeader],
) -> None:
    """Build one conjugated form and append it to ``word_entry.forms``.

    The form records ``page_title`` as its source.  Its raw tags are the
    row header (when non-empty) followed by the text of every column header
    whose span ``[col_index, col_index + colspan)`` contains ``col_index``.
    ``translate_raw_tags`` is applied before the form is stored.
    """
    form = Form(form=form_str, source=page_title)
    if row_header != "":
        form.raw_tags.append(row_header)
    form.raw_tags.extend(
        header.text
        for header in col_headers
        if header.col_index <= col_index < header.col_index + header.colspan
    )
    translate_raw_tags(form)
    word_entry.forms.append(form)