Coverage for src/wiktextract/extractor/pt/inflection.py: 93%
164 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
@dataclass
class TableHeader:
    """A header cell of an inflection table and the grid area it spans.

    Data cells are matched against a header by testing whether their
    column (or row) index lies in ``[index, index + span)``.
    """

    # Cleaned header text; appended to ``Form.raw_tags`` of matching cells.
    text: str
    # Index of the first column covered by the header.
    col_index: int
    # Number of columns covered, starting at ``col_index``.
    colspan: int
    # Index of the first row covered by the header.
    row_index: int
    # Number of rows covered, starting at ``row_index``.
    rowspan: int
def extract_flex_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Append the forms found in an expanded "flex.pt" table to the entry.

    The template is expanded and re-parsed; every direct child table is
    scanned row by row.  Header cells in a row without data cells become
    column headers, header cells in a row with data cells (and data cells
    carrying the grey background style) become the row header, and links
    inside the remaining data cells become forms tagged with the row
    header and every column header whose span covers the cell's column.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:flex.pt
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for table in root.find_child(NodeKind.TABLE):
        column_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_tag = ""
            data_col = 0
            header_col = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                span_attr = cell.attrs.get("colspan", "1")
                span = int(span_attr) if re.fullmatch(r"\d+", span_attr) else 1
                text = clean_node(wxr, None, cell)
                if text == "":
                    # Empty cells are ignored and do not advance any index.
                    continue

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if row.contain_node(NodeKind.TABLE_CELL):
                        row_tag = text
                    else:
                        column_headers.append(
                            TableHeader(text, header_col, span, 0, 0)
                        )
                        header_col += span
                    continue

                if cell.attrs.get("style") == "background:#f4f4f4;":
                    # Data cell styled as a header: treat it as row header.
                    row_tag = text
                    header_col += span
                    continue

                for link in cell.find_child(NodeKind.LINK):
                    form_text = clean_node(wxr, None, link)
                    if form_text in ("", "–", "-", wxr.wtp.title):
                        continue
                    form = Form(form=form_text)
                    if row_tag != "":
                        form.raw_tags.append(row_tag)
                    form.raw_tags.extend(
                        header.text
                        for header in column_headers
                        if header.col_index
                        <= data_col
                        < header.col_index + header.colspan
                    )
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

                # Only ordinary data cells advance the data column index.
                data_col += span
def extract_conjugation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Route each conjugation template under ``level_node`` to its extractor.

    Templates whose name starts with "conj.pt" or "conj/pt" go to
    ``extract_conj_pt_template``; those starting with "conj.en" go to
    ``extract_conj_en_template``.  Any other template is ignored.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        if name.startswith(("conj.pt", "conj/pt")):
            extract_conj_pt_template(wxr, word_entry, template)
            continue
        if name.startswith("conj.en"):
            extract_conj_en_template(wxr, word_entry, template)
def extract_conj_pt_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the first two tables of a "conj.pt" expansion.

    The template is expanded and re-parsed, and tables are looked up
    recursively.  The first table found is handled by
    ``extract_conj_pt_template_first_table`` and the second by
    ``extract_conj_pt_template_second_table``; further tables are ignored.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:conj.pt
    # https://pt.wiktionary.org/wiki/Predefinição:conj/pt
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    tables = root.find_child_recursively(NodeKind.TABLE)
    for position, table in enumerate(tables):
        if position == 0:
            extract_conj_pt_template_first_table(wxr, word_entry, table)
        elif position == 1:
            extract_conj_pt_template_second_table(wxr, word_entry, table)
def extract_conj_pt_template_first_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract forms from the first table of a "conj.pt" expansion.

    Within each row, the text of the most recent header cell is used as
    the raw tag of every following data cell.  Data cells that are empty
    or equal to the page title are skipped.
    """
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        header_text = ""
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                header_text = clean_node(wxr, None, cell)
                continue
            if cell.kind != NodeKind.TABLE_CELL:
                continue

            cell_text = clean_node(wxr, None, cell)
            if cell_text in ("", wxr.wtp.title):
                continue
            form = Form(form=cell_text)
            if header_text != "":
                form.raw_tags.append(header_text)
            translate_raw_tags(form)
            word_entry.forms.append(form)
def extract_conj_pt_template_second_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract forms from the second table of a "conj.pt" expansion.

    Non-empty header cells are recorded as ``TableHeader`` objects: as
    column headers when their row has no data cell, otherwise as row
    headers.  A header with ``rowspan`` greater than one resets the row
    counter and discards the row headers collected so far.  Each data cell
    yields one form per link it contains, or one form from its whole text
    when it has no link; forms are tagged through ``add_conj_pt_form``.
    """
    column_headers = []
    active_row_headers = []
    current_row = 0
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        current_col = 0
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                colspan_attr = cell.attrs.get("colspan", "1")
                colspan = (
                    int(colspan_attr)
                    if re.fullmatch(r"\d+", colspan_attr)
                    else 1
                )
                rowspan_attr = cell.attrs.get("rowspan", "1")
                rowspan = (
                    int(rowspan_attr)
                    if re.fullmatch(r"\d+", rowspan_attr)
                    else 1
                )
                header_text = clean_node(wxr, None, cell)
                if header_text == "":
                    # Empty header cells neither register nor shift columns.
                    continue
                if rowspan > 1:
                    # A multi-row header restarts row numbering and drops
                    # the row headers collected before it.
                    current_row = 0
                    active_row_headers.clear()
                header = TableHeader(
                    header_text, current_col, colspan, current_row, rowspan
                )
                if row.contain_node(NodeKind.TABLE_CELL):
                    active_row_headers.append(header)
                else:
                    column_headers.append(header)
                    current_col += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                found_link = False
                for link in cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link)
                    if link_text not in ("", wxr.wtp.title):
                        add_conj_pt_form(
                            word_entry,
                            link_text,
                            current_col,
                            current_row,
                            column_headers,
                            active_row_headers,
                        )
                    # Set for every link, even one that was filtered out.
                    found_link = True
                if not found_link:
                    cell_text = clean_node(wxr, None, cell)
                    if cell_text not in ("", wxr.wtp.title):
                        add_conj_pt_form(
                            word_entry,
                            cell_text,
                            current_col,
                            current_row,
                            column_headers,
                            active_row_headers,
                        )
                current_col += 1

        current_row += 1
def add_conj_pt_form(
    word_entry: WordEntry,
    form_str: str,
    col_index: int,
    row_index: int,
    col_headers: list[TableHeader],
    row_headers: list[TableHeader],
) -> None:
    """Create a form for one table cell and append it to ``word_entry``.

    The form receives, in order, the text of every column header whose
    column span contains ``col_index`` and then the text of every row
    header whose row span contains ``row_index``.  The raw tags are then
    passed to ``translate_raw_tags``.
    """
    new_form = Form(form=form_str)
    new_form.raw_tags.extend(
        header.text
        for header in col_headers
        if header.col_index <= col_index < header.col_index + header.colspan
    )
    new_form.raw_tags.extend(
        header.text
        for header in row_headers
        if header.row_index <= row_index < header.row_index + header.rowspan
    )
    translate_raw_tags(new_form)
    word_entry.forms.append(new_form)
def extract_conj_en_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Append the forms found in an expanded "conj.en" table to the entry.

    For every data cell, the text of its last ``<sup>`` element (with
    surrounding colons and spaces stripped) is used as the raw tag.  Each
    bold node inside the cell's list items becomes a form unless it is
    empty or equal to the page title.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:conj.en
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for table in root.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(NodeKind.TABLE_CELL):
                tag = ""
                for sup in cell.find_html("sup"):
                    sup_text = clean_node(wxr, None, sup.children)
                    tag = sup_text.strip(": ")
                for item_list in cell.find_child(NodeKind.LIST):
                    for item in item_list.find_child(NodeKind.LIST_ITEM):
                        for bold in item.find_child(NodeKind.BOLD):
                            text = clean_node(wxr, None, bold)
                            if text in ("", wxr.wtp.title):
                                continue
                            form = Form(form=text)
                            if tag != "":
                                form.raw_tags.append(tag)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
def extract_degree_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract degree forms from the list items of a section.

    For each list item, the first bold node supplies the raw tag and the
    children that follow it supply a comma-separated list of forms.  Forms
    that are empty or equal to the page title are skipped.

    Args:
        wxr: Extraction context; used for node cleaning and the page title.
        word_entry: Entry whose ``forms`` list receives the new forms.
        level_node: Section node whose direct child lists are scanned.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            # The second argument makes find_child yield (index, node)
            # pairs; the index is used to slice the children after the
            # bold label.
            for index, bold_node in list_item.find_child(NodeKind.BOLD, True):
                bold_str = clean_node(wxr, None, bold_node)
                forms_str = clean_node(
                    wxr, None, list_item.children[index + 1 :]
                ).strip(": ")
                for form_str in forms_str.split(","):
                    form_str = form_str.strip()
                    if form_str not in ["", wxr.wtp.title]:
                        form = Form(form=form_str)
                        # Guard on the label, not on form_str: form_str is
                        # already known to be non-empty here, so testing it
                        # let an empty label through as a raw tag.
                        if bold_str != "":
                            form.raw_tags.append(bold_str)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                # Only the first bold node of a list item is the label.
                break