Coverage for src/wiktextract/extractor/ru/inflection.py: 95%
122 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from collections import defaultdict
2from dataclasses import dataclass
4from wikitextprocessor import HTMLNode, NodeKind, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
@dataclass
class TableHeader:
    """A header cell of an inflection table and the rows/columns it covers.

    ``parse_html_forms_table`` builds one instance per header cell and later
    compares the index of each data cell with the range that starts at
    ``start_index`` and is ``span`` entries long.
    """

    # Header label; appended to the ``raw_tags`` of the forms it applies to.
    text: str
    # Index of the first column (column header) or row (row header) covered.
    start_index: int
    # Number of columns/rows covered, read from ``colspan``/``rowspan``.
    span: int
# Викисловарь:Шаблоны словоизменений ("Wiktionary: Inflection templates")
def _is_header_td(node: HTMLNode) -> bool:
    """Return True for a ``<td>`` that carries the header background colour.

    The colour is compared case-insensitively so ``#EEF9FF`` and ``#eef9ff``
    are treated alike, matching ``parse_wikitext_forms_table``.
    """
    return (
        node.tag == "td"
        and node.attrs.get("bgcolor", "").lower() == "#eef9ff"
    )


def parse_html_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_tag: HTMLNode
) -> None:
    """Extract inflected forms from an HTML ``<table>`` into ``word_entry``.

    Example template: https://ru.wiktionary.org/wiki/Шаблон:прил

    The table rows are walked twice.  The first pass collects column and row
    headers as ``TableHeader`` objects; the second pass turns every line of
    every data cell into a ``Form`` tagged with the headers covering it.

    Args:
        wxr: Context passed through to ``clean_node``.
        word_entry: Entry whose ``forms`` list receives the extracted forms.
        table_tag: The ``<table>`` node to parse.
    """
    column_headers = []
    row_headers = []
    # Data-column index -> number of following rows in which that column is
    # still occupied by a cell that declared a ``rowspan``.
    td_rowspan = defaultdict(int)

    # Pass 1: collect headers.
    for row_index, tr_element in enumerate(table_tag.find_html("tr")):
        has_td = any(True for _ in tr_element.find_html("td"))
        if not has_td:
            # A row without <td> holds only column headers.
            col_index = 0
            for th_element in tr_element.find_html("th"):
                # Header text comes from a link node in the cell; if there
                # are several the last one wins, if none it stays empty.
                header_text = ""
                for header_link in th_element.find_child(NodeKind.LINK):
                    header_text = clean_node(wxr, None, header_link)
                if header_text == "падеж":
                    continue  # ignore top left corner header
                header_span = int(th_element.attrs.get("colspan", "1"))
                column_headers.append(
                    TableHeader(header_text, col_index, header_span)
                )
                col_index += header_span
        else:
            # Row headers are <th> cells or <td> cells styled as headers.
            for node in tr_element.children:
                if isinstance(node, HTMLNode) and (
                    node.tag == "th" or _is_header_td(node)
                ):
                    header_text = ""
                    for header_link in node.find_child(NodeKind.LINK):
                        header_text = clean_node(wxr, None, header_link)
                    header_span = int(node.attrs.get("rowspan", "1"))
                    row_headers.append(
                        TableHeader(header_text, row_index, header_span)
                    )

    # Pass 2: collect forms from the data cells.
    for row_index, tr_element in enumerate(table_tag.find_html("tr")):
        col_index = 0
        has_rowspan = False
        for td_element in tr_element.find_html("td"):
            rowspan = 1
            if _is_header_td(td_element):
                # this is a td tag but contains header text
                continue
            if "rowspan" in td_element.attrs:
                rowspan = int(td_element.attrs["rowspan"])
                td_rowspan[col_index] = rowspan - 1
                has_rowspan = True
            elif not has_rowspan:
                # No cell of this row declared a rowspan so far: step over a
                # column still occupied by a rowspan cell from an earlier
                # row and use up one of its remaining rows.
                for rowspan_index, rowspan_value in td_rowspan.items():
                    if rowspan_value > 0 and col_index == rowspan_index:
                        col_index += 1
                        td_rowspan[rowspan_index] -= 1
            td_text = clean_node(wxr, None, td_element)
            for line in td_text.splitlines():
                form = Form(form=line)
                for col_header in column_headers:
                    header_end = col_header.start_index + col_header.span
                    if col_header.start_index <= col_index < header_end:
                        form.raw_tags.append(col_header.text)
                for row_header in row_headers:
                    # The rows covered by this cell must overlap the rows
                    # covered by the header.
                    if (
                        row_index < row_header.start_index + row_header.span
                        and row_index + rowspan > row_header.start_index
                    ):
                        form.raw_tags.append(row_header.text)
                if form.form:
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
            col_index += 1
def _append_row_header(
    row_headers: list[str], table_cell: WikiNode, cell_text: str
) -> None:
    """Add the row header carried by ``table_cell`` to ``row_headers``.

    A cell whose cleaned text is the abbreviation "М." and which contains a
    link contributes the first element of the first argument of its first
    link child (``largs[0][0]``) instead of the abbreviated text.  Every
    other cell contributes ``cell_text`` unchanged.
    """
    if cell_text == "М." and table_cell.contain_node(NodeKind.LINK):
        # Only the first link yielded by find_child is used; if it yields
        # nothing, no header is added for this cell.
        for link_node in table_cell.find_child(NodeKind.LINK):
            row_headers.append(link_node.largs[0][0])
            break
    else:
        row_headers.append(cell_text)


def parse_wikitext_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract inflected forms from a wikitext table into ``word_entry``.

    Templates this was written for:
      * https://ru.wiktionary.org/wiki/Шаблон:сущ-ru
      * Шаблон:inflection сущ ru
      * Шаблон:Гл-блок

    Args:
        wxr: Context passed through to ``clean_node``.
        word_entry: Entry that receives the forms via ``add_form_data``.
        table_node: The table node whose rows are parsed.
    """
    # Column headers accumulate across rows; row headers are per row.
    column_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_headers = []
        has_data_cell = table_row.contain_node(NodeKind.TABLE_CELL)
        for col_index, table_cell in enumerate(
            table_row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
        ):
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                cell_text = clean_node(wxr, None, table_cell)
                if not has_data_cell:
                    # A row without data cells holds column headers.
                    column_headers.append(cell_text)
                else:
                    _append_row_header(row_headers, table_cell, cell_text)
            elif table_cell.kind == NodeKind.TABLE_CELL:
                # Drop <tr> tags that ended up inside the cell; they are
                # handled separately by the loop further down.
                cell_children = [
                    n
                    for n in table_cell.children
                    if not (isinstance(n, HTMLNode) and n.tag == "tr")
                ]
                cell_text = clean_node(wxr, None, cell_children)
                if table_cell.attrs.get("bgcolor", "").lower() == "#eef9ff":
                    # A data cell with the header colour is a row header.
                    _append_row_header(row_headers, table_cell, cell_text)
                else:
                    for form_text in cell_text.splitlines():
                        add_form_data(
                            word_entry,
                            form_text,
                            row_headers,
                            column_headers,
                            col_index,
                        )

        # cursed layout from Шаблон:Гл-блок
        # tr tag could be after or inside table cell node: Шаблон:сущ cu (-а)
        for tr_tag in table_row.find_html_recursively("tr"):
            row_headers = [
                clean_node(wxr, None, th_tag)
                for th_tag in tr_tag.find_html("th")
            ]
            has_th_tag = len(row_headers) > 0
            for td_index, td_tag in enumerate(tr_tag.find_html("td")):
                # A cell with "colspan" is not matched to a column header.
                td_col_headers = (
                    [] if "colspan" in td_tag.attrs else column_headers
                )
                if td_tag.contain_node(NodeKind.LINK):
                    is_header_td = (
                        td_tag.attrs.get("bgcolor", "").lower() == "#eef9ff"
                    )
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        link_text = clean_node(wxr, None, link_node)
                        if is_header_td:
                            row_headers.append(link_text)
                        else:
                            # NOTE(review): unlike the branch without links
                            # below, the index is not shifted by one when
                            # the row has <th> cells - confirm intended.
                            add_form_data(
                                word_entry,
                                link_text,
                                row_headers,
                                td_col_headers,
                                td_index,
                            )
                else:
                    add_form_data(
                        word_entry,
                        clean_node(wxr, None, td_tag),
                        row_headers,
                        td_col_headers,
                        td_index + 1 if has_th_tag else td_index,
                    )
def add_form_data(
    word_entry: WordEntry,
    form_text: str,
    row_headers: list[str],
    col_headers: list[str],
    col_index: int,
) -> None:
    """Create a ``Form`` from one table cell line and attach it to the entry.

    Spaces and slashes are stripped from both ends of ``form_text``.  The
    form is tagged with every row header and, when ``col_index`` is below
    the length of ``col_headers`` and that header is non-empty, with that
    column header.  Empty forms and the placeholders "—" and "-" are
    discarded.
    """
    entry_form = Form(form=form_text.strip(" /"))

    collected_tags = list(row_headers)
    column_tag = col_headers[col_index] if col_index < len(col_headers) else ""
    if column_tag != "":
        collected_tags.append(column_tag)
    entry_form.raw_tags.extend(collected_tags)

    # Empty cells and dash placeholders do not describe a real form.
    if entry_form.form in ("", "—", "-"):
        return
    translate_raw_tags(entry_form)
    word_entry.forms.append(entry_form)
def extract_прил_ru_comparative_forms(
    wxr: WiktextractContext, word_entry: WordEntry, expanded_node: WikiNode
) -> None:
    """Collect comparative forms that follow the "Сравнительная степень —" text.

    Children of ``expanded_node`` are scanned in order.  Once a string child
    whose cleaned text ends with "Сравнительная степень —" has been seen,
    every link found in a later italic child is added to ``word_entry.forms``
    as a form tagged "comparative".  Links with empty text are skipped.
    """
    marker_seen = False
    for child in expanded_node.children:
        if isinstance(child, str):
            child_text = clean_node(wxr, None, child)
            if child_text.endswith("Сравнительная степень —"):
                marker_seen = True
            continue
        if not marker_seen:
            continue
        if not isinstance(child, WikiNode) or child.kind != NodeKind.ITALIC:
            continue
        for link in child.find_child(NodeKind.LINK):
            comparative = clean_node(wxr, None, link)
            if comparative == "":
                continue
            word_entry.forms.append(
                Form(form=comparative, tags=["comparative"])
            )