Coverage for src/wiktextract/extractor/ru/inflection.py: 98%
131 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1from dataclasses import dataclass
2from itertools import chain
4from wikitextprocessor import HTMLNode, NodeKind, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
@dataclass
class TableHeader:
    """Text of a table cell together with its grid position and extent.

    Field order matters: instances are built positionally as
    ``TableHeader(text, col_index, colspan, row_index, rowspan)``.
    """

    # Cleaned text of the cell.
    text: str
    # Index of the first column the cell occupies.
    col_index: int = 0
    # Number of columns the cell covers.
    colspan: int = 1
    # Index of the first row the cell occupies.
    row_index: int = 0
    # Number of rows the cell covers.
    rowspan: int = 1
21# Викисловарь:Шаблоны словоизменений
def parse_html_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_tag: HTMLNode
):
    """Collect inflected forms from an HTML ``<table>`` into ``word_entry``.

    The table is walked twice. The first walk records every ``<th>`` cell
    with its grid position: headers in rows without any ``<td>`` are
    treated as column headers, headers in rows that do contain ``<td>``
    as row headers. The second walk visits every ``<td>``, gathers the
    texts of the headers that overlap it, and appends one ``Form`` per
    comma- or newline-separated word to ``word_entry.forms``.

    Empty strings, the dash placeholder and the page title itself are not
    emitted as forms.
    """
    # HTML table
    # https://ru.wiktionary.org/wiki/Шаблон:прил
    # Шаблон:спряжения
    top_headers = []
    side_headers = []
    for row_no, tr in enumerate(table_tag.find_html("tr")):
        is_data_row = any(tr.find_html("td"))
        col_no = 0
        # Skip columns taken by headers that started in an earlier row and
        # still reach into this one through their rowspan.
        for known in chain(top_headers, side_headers):
            reaches_row = (
                known.row_index < row_no < known.row_index + known.rowspan
            )
            if reaches_row and known.col_index <= col_no:
                col_no += known.colspan
        # All <th> cells of one row go to the same list.
        bucket = side_headers if is_data_row else top_headers
        for th in tr.find_html("th"):
            header_text = clean_node(wxr, None, th)
            width = int(th.attrs.get("colspan", "1"))
            height = int(th.attrs.get("rowspan", "1"))
            bucket.append(
                TableHeader(header_text, col_no, width, row_no, height)
            )
            col_no += width

    # Data cells with rowspan > 1; they push later rows' cells to the right.
    tall_cells = []
    for row_no, tr in enumerate(table_tag.find_html("tr")):
        col_no = 0
        # Row index of the most recently recorded column header located
        # strictly above this row (0 when there is none).
        nearest_header_row = next(
            (
                top.row_index
                for top in reversed(top_headers)
                if top.row_index < row_no
            ),
            0,
        )
        # Skip the columns occupied by row headers covering this row.
        for side in side_headers:
            if (
                side.row_index <= row_no < side.row_index + side.rowspan
                and side.col_index <= col_no
            ):
                col_no += side.colspan

        for td in tr.find_html("td"):
            for earlier in tall_cells:
                if (
                    earlier.row_index
                    < row_no
                    < earlier.row_index + earlier.rowspan
                    and earlier.col_index <= col_no
                ):
                    col_no += earlier.colspan
            width = int(td.attrs.get("colspan", "1"))
            height = int(td.attrs.get("rowspan", "1"))
            if height > 1:
                tall_cells.append(
                    TableHeader("", col_no, width, row_no, height)
                )
            cell_text = clean_node(wxr, None, td)

            # Column headers are scanned bottom-up and reversed afterwards,
            # so the resulting tags run from the outermost header inwards.
            picked = []
            for top in reversed(top_headers):
                ends_at = top.row_index + top.rowspan
                if (
                    top.col_index < col_no + width
                    and col_no < top.col_index + top.colspan
                    and top.text not in picked
                    # column header above cell and above last header
                    # don't use headers for other top sections
                    and ends_at
                    in (nearest_header_row, nearest_header_row + 1)
                ):
                    picked.append(top.text)
            raw_tags = picked[::-1]
            for side in side_headers:
                if (
                    side.row_index < row_no + height
                    and row_no < side.row_index + side.rowspan
                    and side.text not in raw_tags
                ):
                    raw_tags.append(side.text)

            words = (
                piece.strip()
                for text_line in cell_text.splitlines()
                for piece in text_line.split(",")
            )
            for word in words:
                if word in ("", "—", wxr.wtp.title):
                    continue
                form = Form(form=word, raw_tags=raw_tags)
                translate_raw_tags(form)
                word_entry.forms.append(form)
            col_no += width
def parse_wikitext_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table: WikiNode
) -> None:
    """Extract inflected forms from a wikitext table into ``word_entry``.

    The table is walked twice. The first walk records every header cell
    with its grid position: header cells in rows without data cells become
    column headers, header cells in rows with data cells become row
    headers. The second walk visits every data cell and appends one
    ``Form`` per non-empty text line to ``word_entry.forms``, tagged with
    the texts of the column headers above it and the row headers beside it.

    Lines that are empty, a dash placeholder, or equal to the page title
    are not emitted. Header selection state is reset for every line, so
    all alternative forms listed in one cell receive the same tags.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:сущ-ru
    # Шаблон:inflection сущ ru
    col_headers = []
    row_headers = []
    for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
        row_has_data = row.contain_node(NodeKind.TABLE_CELL)
        col_index = 0
        # Skip columns taken by headers that started in an earlier row and
        # still reach into this one through their rowspan.
        for header in chain(col_headers, row_headers):
            if (
                row_index > header.row_index
                and row_index < header.row_index + header.rowspan
                and header.col_index <= col_index
            ):
                col_index += header.colspan
        for cell_node in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell_node)
            colspan = int(cell_node.attrs.get("colspan", "1"))
            rowspan = int(cell_node.attrs.get("rowspan", "1"))
            if cell_node.kind == NodeKind.TABLE_CELL:
                # Data cells only advance the column counter in this pass.
                pass
            elif not row_has_data:
                col_headers.append(
                    TableHeader(
                        cell_text, col_index, colspan, row_index, rowspan
                    )
                )
            else:
                if cell_text == "М." and cell_node.contain_node(NodeKind.LINK):
                    # Use the first link's target instead of the
                    # abbreviated header text.
                    for link_node in cell_node.find_child(NodeKind.LINK):
                        cell_text = clean_node(wxr, None, link_node.largs[0][0])
                        break
                row_headers.append(
                    TableHeader(
                        cell_text, col_index, colspan, row_index, rowspan
                    )
                )
            col_index += colspan

    for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
        col_index = 0
        # Skip the columns occupied by headers covering this row.
        for header in chain(col_headers, row_headers):
            if (
                row_index >= header.row_index
                and row_index < header.row_index + header.rowspan
                and header.col_index <= col_index
            ):
                col_index += header.colspan
        for cell_node in row.find_child(NodeKind.TABLE_CELL):
            colspan = int(cell_node.attrs.get("colspan", "1"))
            rowspan = int(cell_node.attrs.get("rowspan", "1"))
            cell_text = clean_node(wxr, None, cell_node)
            for line in cell_text.splitlines():
                line = line.strip("\n /")
                if line in ["", "—", "-", wxr.wtp.title]:
                    continue
                form = Form(form=line)

                # Reset for every line: carrying the row index over from
                # the previous line of the same cell made later lines skip
                # the lower levels of stacked column headers.
                last_col_header_row = -1
                col_tags = []
                # Scan bottom-up: the first match is the closest header
                # ending at or above this row; every further match has to
                # end at the row of the previously matched header (or one
                # row below it).
                for col_header in col_headers[::-1]:
                    if (
                        col_header.text != ""
                        and col_header.col_index < col_index + colspan
                        and col_index
                        < col_header.col_index + col_header.colspan
                        and col_header.text not in form.raw_tags
                        and col_header.text not in col_tags
                        and (
                            (
                                last_col_header_row != -1
                                and col_header.row_index
                                + col_header.rowspan
                                in [
                                    last_col_header_row,
                                    last_col_header_row + 1,
                                ]
                            )
                            or (
                                last_col_header_row == -1
                                and col_header.row_index
                                + col_header.rowspan
                                <= row_index
                            )
                        )
                    ):
                        col_tags.append(col_header.text)
                        last_col_header_row = col_header.row_index
                # Reverse so that outer headers come before inner ones.
                form.raw_tags.extend(col_tags[::-1])

                row_tags = []
                for row_header in row_headers[::-1]:
                    if (
                        row_header.text != ""
                        and row_header.row_index < row_index + rowspan
                        and row_index
                        < row_header.row_index + row_header.rowspan
                        and row_header.text not in form.raw_tags
                        and row_header.text not in row_tags
                    ):
                        row_tags.append(row_header.text)
                form.raw_tags.extend(row_tags[::-1])
                translate_raw_tags(form)
                word_entry.forms.append(form)
            col_index += colspan
def extract_прил_ru_comparative_forms(
    wxr: WiktextractContext, word_entry: WordEntry, expanded_node: WikiNode
) -> None:
    """Add comparative forms found after the comparative-degree marker.

    Scans the direct children of ``expanded_node``. Once a string child
    whose cleaned text ends with the marker "Сравнительная степень —" has
    been seen, every link inside each following italic node is appended to
    ``word_entry.forms`` as a ``Form`` tagged ``comparative``. Links whose
    cleaned text is empty are ignored.
    """
    marker_seen = False
    for child in expanded_node.children:
        if isinstance(child, str):
            child_text = clean_node(wxr, None, child)
            if child_text.endswith("Сравнительная степень —"):
                marker_seen = True
            continue
        if not marker_seen:
            continue
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC):
            continue
        for link in child.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link)
            if link_text != "":
                word_entry.forms.append(
                    Form(form=link_text, tags=["comparative"])
                )