Coverage for src/wiktextract/extractor/ru/inflection.py: 98%
138 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-07 08:08 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-07 08:08 +0000
1from dataclasses import dataclass
2from itertools import chain
4from wikitextprocessor import HTMLNode, NodeKind, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
@dataclass
class TableHeader:
    """Grid position and extent of a single table cell.

    Instances record header cells of an inflection table; the HTML parser in
    this module also uses them (with an empty ``text``) to remember data
    cells whose ``rowspan`` reaches into later rows.
    """

    # Cleaned text of the cell (empty for tracked data cells).
    text: str
    # Zero-based column at which the cell starts.
    col_index: int = 0
    # Number of columns the cell covers.
    colspan: int = 1
    # Zero-based row at which the cell starts.
    row_index: int = 0
    # Number of rows the cell covers.
    rowspan: int = 1
# Викисловарь:Шаблоны словоизменений ("Wiktionary: Inflection templates")
def parse_html_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_tag: HTMLNode
):
    """Append the forms found in an HTML ``<table>`` to ``word_entry.forms``.

    Used for HTML inflection tables such as the ones produced by
    https://ru.wiktionary.org/wiki/Шаблон:прил and Шаблон:спряжения.

    The table is walked twice.  The first walk records every ``<th>`` cell
    as a column header (row without any ``<td>``) or a row header (row with
    at least one ``<td>``).  The second walk visits every ``<td>``, gathers
    the texts of the headers that apply to it as raw tags, and creates one
    ``Form`` per comma-separated word on each line of the cell text.
    """
    col_headers = []
    row_headers = []

    # Pass 1: locate all header cells.
    for row_index, tr_tag in enumerate(table_tag.find_html("tr")):
        row_has_data = any(tr_tag.find_html("td"))
        col_index = 0
        # Skip the columns taken by headers that span down from earlier rows.
        for earlier in chain(col_headers, row_headers):
            if (
                earlier.row_index
                < row_index
                < earlier.row_index + earlier.rowspan
                and earlier.col_index <= col_index
            ):
                col_index += earlier.colspan
        bucket = row_headers if row_has_data else col_headers
        for th_tag in tr_tag.find_html("th"):
            header_text = clean_node(wxr, None, th_tag)
            colspan = int(th_tag.attrs.get("colspan", "1"))
            rowspan = int(th_tag.attrs.get("rowspan", "1"))
            bucket.append(
                TableHeader(
                    text=header_text,
                    col_index=col_index,
                    colspan=colspan,
                    row_index=row_index,
                    rowspan=rowspan,
                )
            )
            col_index += colspan

    # Pass 2: read the data cells.
    # Data cells with rowspan > 1, remembered so that later rows can skip
    # the columns they occupy.
    spanning_cells = []
    for row_index, tr_tag in enumerate(table_tag.find_html("tr")):
        col_index = 0
        # Row of the most recently recorded column header that starts above
        # this row (0 when there is none).
        last_col_header_row = next(
            (
                header.row_index
                for header in reversed(col_headers)
                if header.row_index < row_index
            ),
            0,
        )
        # Data cells start after the row headers that cover this row.
        for row_header in row_headers:
            if (
                row_header.row_index
                <= row_index
                < row_header.row_index + row_header.rowspan
                and row_header.col_index <= col_index
            ):
                col_index += row_header.colspan

        for td_tag in tr_tag.find_html("td"):
            for above_td in spanning_cells:
                if (
                    above_td.row_index
                    < row_index
                    < above_td.row_index + above_td.rowspan
                    and above_td.col_index <= col_index
                ):
                    col_index += above_td.colspan
            colspan = int(td_tag.attrs.get("colspan", "1"))
            rowspan = int(td_tag.attrs.get("rowspan", "1"))
            if rowspan > 1:
                spanning_cells.append(
                    TableHeader("", col_index, colspan, row_index, rowspan)
                )

            # Column header above cell and above last header; don't use
            # headers for other top sections.
            allowed_header_ends = (
                last_col_header_row,
                last_col_header_row + 1,
            )
            col_tags = []
            for col_header in reversed(col_headers):
                overlaps_cell = (
                    col_header.col_index < col_index + colspan
                    and col_index < col_header.col_index + col_header.colspan
                )
                if (
                    overlaps_cell
                    and col_header.text not in col_tags
                    and col_header.row_index + col_header.rowspan
                    in allowed_header_ends
                ):
                    col_tags.append(col_header.text)
            # Headers were collected bottom-up; store them top-down.
            raw_tags = col_tags[::-1]

            for row_header in row_headers:
                if (
                    row_header.row_index < row_index + rowspan
                    and row_index < row_header.row_index + row_header.rowspan
                    and row_header.text not in raw_tags
                ):
                    raw_tags.append(row_header.text)

            # <span style="...cursor:help..." title="..."> children supply an
            # extra tag through their title and are left out of the form
            # text.
            form_nodes = []
            for td_child in td_tag.children:
                is_help_span = (
                    isinstance(td_child, HTMLNode)
                    and td_child.tag == "span"
                    and "cursor:help" in td_child.attrs.get("style", "")
                )
                if not is_help_span:
                    form_nodes.append(td_child)
                    continue
                sup_tag = td_child.attrs.get("title", "")
                if sup_tag != "":
                    raw_tags.append(sup_tag)

            td_text = clean_node(wxr, None, form_nodes)
            for line in td_text.splitlines():
                for word in line.split(","):
                    word = word.strip()
                    # Skip blanks, the dash placeholder and the page title.
                    if word in ("", "—", wxr.wtp.title):
                        continue
                    form = Form(form=word, raw_tags=raw_tags)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
            col_index += colspan
def _wikitext_cell_raw_tags(
    col_headers: list[TableHeader],
    row_headers: list[TableHeader],
    row_index: int,
    col_index: int,
    colspan: int,
    rowspan: int,
) -> list[str]:
    """Return the header texts that apply to one data cell.

    Column header texts come first (topmost header first), followed by row
    header texts in the order the row headers were recorded.  Empty and
    duplicate texts are left out.
    """
    col_tags: list[str] = []
    # Row of the header accepted most recently while walking bottom-up;
    # -1 until the first header has been accepted.
    last_col_header_row = -1
    for col_header in reversed(col_headers):
        if (
            col_header.text == ""
            or col_header.text in col_tags
            or col_header.col_index >= col_index + colspan
            or col_index >= col_header.col_index + col_header.colspan
        ):
            continue
        header_end = col_header.row_index + col_header.rowspan
        if last_col_header_row == -1:
            # First match: any header that ends at or above the cell's row.
            applies = header_end <= row_index
        else:
            # Later matches must end on, or directly below, the start row
            # of the previously accepted header.
            applies = header_end in (
                last_col_header_row,
                last_col_header_row + 1,
            )
        if applies:
            col_tags.append(col_header.text)
            last_col_header_row = col_header.row_index
    # Collected bottom-up; emit top-down.
    raw_tags = col_tags[::-1]

    row_tags: list[str] = []
    for row_header in reversed(row_headers):
        if (
            row_header.text != ""
            and row_header.row_index < row_index + rowspan
            and row_index < row_header.row_index + row_header.rowspan
            and row_header.text not in raw_tags
            and row_header.text not in row_tags
        ):
            row_tags.append(row_header.text)
    raw_tags.extend(reversed(row_tags))
    return raw_tags


def parse_wikitext_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table: WikiNode
) -> None:
    """Append the forms found in a wikitext table to ``word_entry.forms``.

    Used for tables such as the ones produced by
    https://ru.wiktionary.org/wiki/Шаблон:сущ-ru and
    Шаблон:inflection сущ ru.

    The table is walked twice.  The first walk records header cells as
    column headers (row without data cells) or row headers (row with data
    cells).  The second walk visits every data cell and creates one ``Form``
    per non-empty line of the cell text, tagged with the texts of the
    headers that apply to the cell.

    Every line of a cell receives the same raw tags.  Earlier revisions
    carried header-matching state from one line to the next, which could
    drop column headers for the second and later lines of a multi-line cell.
    """
    col_headers = []
    row_headers = []

    # Pass 1: locate all header cells.
    for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        col_index = 0
        # Skip the columns taken by headers that span down from earlier rows.
        for header in chain(col_headers, row_headers):
            if (
                header.row_index
                < row_index
                < header.row_index + header.rowspan
                and header.col_index <= col_index
            ):
                col_index += header.colspan
        for cell_node in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell_node)
            colspan = int(cell_node.attrs.get("colspan", "1"))
            rowspan = int(cell_node.attrs.get("rowspan", "1"))
            if cell_node.kind == NodeKind.TABLE_CELL:
                # Data cells only advance the column counter in this pass.
                pass
            elif not row_has_data:
                col_headers.append(
                    TableHeader(
                        cell_text, col_index, colspan, row_index, rowspan
                    )
                )
            else:
                if cell_text == "М." and cell_node.contain_node(NodeKind.LINK):
                    # The header is displayed as the abbreviation "М.";
                    # use the target of its first link as the header text.
                    for link_node in cell_node.find_child(NodeKind.LINK):
                        cell_text = clean_node(
                            wxr, None, link_node.largs[0][0]
                        )
                        break
                row_headers.append(
                    TableHeader(
                        cell_text, col_index, colspan, row_index, rowspan
                    )
                )
            col_index += colspan

    # Pass 2: read the data cells.
    for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
        col_index = 0
        # Data cells start after the headers that cover this row.
        for header in chain(col_headers, row_headers):
            if (
                header.row_index
                <= row_index
                < header.row_index + header.rowspan
                and header.col_index <= col_index
            ):
                col_index += header.colspan
        for cell_node in row.find_child(NodeKind.TABLE_CELL):
            colspan = int(cell_node.attrs.get("colspan", "1"))
            rowspan = int(cell_node.attrs.get("rowspan", "1"))
            cell_text = clean_node(wxr, None, cell_node)
            # The applicable headers depend only on the cell's position, so
            # compute them once and reuse them for every line of the cell.
            cell_tags = _wikitext_cell_raw_tags(
                col_headers,
                row_headers,
                row_index,
                col_index,
                colspan,
                rowspan,
            )
            for line in cell_text.splitlines():
                line = line.strip("\n /")
                # Skip blanks, dash placeholders and the page title.
                if line in ("", "—", "-", wxr.wtp.title):
                    continue
                form = Form(form=line)
                # NOTE(review): assumes a new Form starts with an empty
                # raw_tags list - confirm against the Form model.
                form.raw_tags.extend(cell_tags)
                translate_raw_tags(form)
                word_entry.forms.append(form)
            col_index += colspan
def extract_прил_ru_comparative_forms(
    wxr: WiktextractContext, word_entry: WordEntry, expanded_node: WikiNode
) -> None:
    """Append comparative forms listed in ``expanded_node`` to ``word_entry``.

    Direct children are scanned in order.  Once a string child whose cleaned
    text ends with "Сравнительная степень —" has been seen, every link found
    in a later italic child yields a ``Form`` tagged ``comparative`` (links
    whose cleaned text is empty are skipped).
    """
    marker_seen = False
    for child in expanded_node.children:
        if isinstance(child, str):
            child_text = clean_node(wxr, None, child)
            if child_text.endswith("Сравнительная степень —"):
                marker_seen = True
            continue
        if not marker_seen:
            continue
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC):
            continue
        for link_node in child.find_child(NodeKind.LINK):
            form_text = clean_node(wxr, None, link_node)
            if form_text == "":
                continue
            word_entry.forms.append(
                Form(form=form_text, tags=["comparative"])
            )