Coverage for src/wiktextract/extractor/ru/inflection.py: 92%
127 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1from collections import defaultdict
2from dataclasses import dataclass
4from wikitextprocessor import HTMLNode, NodeKind, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
@dataclass
class TableHeader:
    """A header cell of an HTML inflection table and the range it covers.

    ``parse_html_forms_table`` creates column headers with ``start_index``
    set to the first covered column and ``span`` taken from ``colspan``.
    Row headers are created with ``start_index`` 0 and ``span`` taken from
    ``rowspan``; their ``span`` is decremented as rows are consumed.
    """

    # Cleaned header label; later appended to ``Form.raw_tags``.
    text: str
    # Index of the first column covered by this header.
    start_index: int
    # Number of columns (column header) or rows (row header) covered.
    span: int
# Reference: "Викисловарь:Шаблоны словоизменений"
# (Russian Wiktionary project page listing the inflection templates)
def _header_link_text(wxr: WiktextractContext, cell: HTMLNode) -> str:
    """Return the cleaned text of the last direct link child of *cell*.

    Returns an empty string when *cell* has no direct ``LINK`` child.
    """
    header_text = ""
    for header_link in cell.find_child(NodeKind.LINK):
        header_text = clean_node(wxr, None, header_link)
    return header_text


def parse_html_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_tag: HTMLNode
) -> None:
    """Extract word forms from an HTML ``<table>`` inflection table.

    Rows without any ``<td>`` are treated as column-header rows.  In other
    rows, ``<th>`` cells and ``<td>`` cells whose ``bgcolor`` is ``#EEF9FF``
    (compared case-insensitively, as ``parse_wikitext_forms_table`` does)
    are row headers; every other ``<td>`` contributes one form per
    whitespace-separated token of its cleaned text.  Each form receives the
    text of all column headers covering its column and of all active row
    headers as raw tags, is passed to ``translate_raw_tags`` and is appended
    to ``word_entry.forms``.

    Args:
        wxr: Extraction context forwarded to ``clean_node``.
        word_entry: Entry whose ``forms`` list is extended in place.
        table_tag: The HTML ``<table>`` node to read.
    """
    # HTML table
    # https://ru.wiktionary.org/wiki/Шаблон:прил
    column_headers: list[TableHeader] = []
    row_headers: list[TableHeader] = []
    # column index -> number of following rows still covered by a
    # ``rowspan`` data cell that started in an earlier row
    td_rowspan: defaultdict[int, int] = defaultdict(int)
    for tr_element in table_tag.find_html("tr"):
        # Collected once: used both for the "header-only row" test and for
        # the data-cell loop below.
        td_elements = list(tr_element.find_html("td"))
        if not td_elements:
            # all header
            current_index = 0
            for th_element in tr_element.find_html("th"):
                header_text = _header_link_text(wxr, th_element)
                if header_text == "падеж":
                    continue  # ignore top left corner header
                header_span = int(th_element.attrs.get("colspan", "1"))
                column_headers.append(
                    TableHeader(header_text, current_index, header_span)
                )
                current_index += header_span
        else:
            col_index = 0
            has_rowspan = False
            for th_element in tr_element.find_html("th"):
                header_text = _header_link_text(wxr, th_element)
                header_span = int(th_element.attrs.get("rowspan", "1"))
                row_headers.append(TableHeader(header_text, 0, header_span))

            for td_element in td_elements:
                if td_element.attrs.get("bgcolor", "").lower() == "#eef9ff":
                    # this is a td tag but contains header text
                    header_text = _header_link_text(wxr, td_element)
                    header_span = int(td_element.attrs.get("rowspan", "1"))
                    row_headers.append(
                        TableHeader(header_text, 0, header_span)
                    )
                    continue
                if "rowspan" in td_element.attrs:
                    # Remember how many later rows this column stays covered.
                    td_rowspan[col_index] = (
                        int(td_element.attrs["rowspan"]) - 1
                    )
                    has_rowspan = True
                elif not has_rowspan:
                    # Skip columns still covered by a rowspan cell from an
                    # earlier row and use up one row of that coverage.
                    for rowspan_index, rowspan_value in td_rowspan.items():
                        if rowspan_value > 0 and col_index == rowspan_index:
                            col_index += 1
                            td_rowspan[rowspan_index] -= 1
                td_text = clean_node(wxr, None, td_element)
                for line in td_text.split():
                    form = Form(form=line)
                    for col_header in column_headers:
                        header_end = col_header.start_index + col_header.span
                        if col_header.start_index <= col_index < header_end:
                            form.raw_tags.append(col_header.text)
                    form.raw_tags.extend([h.text for h in row_headers])
                    if form.form:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += 1

            # Row headers spanning several rows stay active with one row
            # less; all other row headers end with this row.
            updated_row_headers = []
            for row_header in row_headers:
                if row_header.span > 1:
                    row_header.span -= 1
                    updated_row_headers.append(row_header)
            row_headers = updated_row_headers
def _append_wikitext_row_header(
    row_headers: list[str], table_cell: WikiNode, cell_text: str
) -> None:
    """Append the row-header label for *table_cell* to *row_headers*.

    A cell whose cleaned text is "М." and which contains a link is labelled
    with the first argument of its first direct link child instead of the
    abbreviation; if it has no direct link child nothing is appended.  Any
    other cell is labelled with *cell_text*.
    """
    if cell_text != "М." or not table_cell.contain_node(NodeKind.LINK):
        row_headers.append(cell_text)
        return
    for link_node in table_cell.find_child(NodeKind.LINK):
        row_headers.append(link_node.largs[0][0])
        break


def parse_wikitext_forms_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract word forms from a wikitext inflection table.

    Header cells in rows without data cells become column headers.  In rows
    with data cells, header cells and data cells whose ``bgcolor`` is
    ``#eef9ff`` (case-insensitive) become row headers; every other data cell
    yields one form per line of its cleaned text.  Raw ``<tr>`` HTML tags
    found anywhere inside a table row are then processed as additional rows.
    All forms are recorded through ``add_form_data``.

    Args:
        wxr: Extraction context forwarded to ``clean_node``.
        word_entry: Entry that receives the extracted forms.
        table_node: The wikitext ``TABLE`` node to read.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:сущ-ru
    # Шаблон:inflection сущ ru
    # Шаблон:Гл-блок
    column_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_headers = []
        has_data_cell = table_row.contain_node(NodeKind.TABLE_CELL)
        row_cells = table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        )
        for col_index, table_cell in enumerate(row_cells):
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                cell_text = clean_node(wxr, None, table_cell)
                if has_data_cell:
                    _append_wikitext_row_header(
                        row_headers, table_cell, cell_text
                    )
                else:
                    column_headers.append(cell_text)
            elif table_cell.kind == NodeKind.TABLE_CELL:
                # remove cursed <tr> tag
                children_without_tr = [
                    child
                    for child in table_cell.children
                    if not isinstance(child, HTMLNode) or child.tag != "tr"
                ]
                cell_text = clean_node(wxr, None, children_without_tr)
                if table_cell.attrs.get("bgcolor", "").lower() == "#eef9ff":
                    _append_wikitext_row_header(
                        row_headers, table_cell, cell_text
                    )
                    continue
                for form_text in cell_text.splitlines():
                    add_form_data(
                        word_entry,
                        form_text,
                        row_headers,
                        column_headers,
                        col_index,
                    )

        # cursed layout from Шаблон:Гл-блок
        # tr tag could be after or inside table cell node: Шаблон:сущ cu (-а)
        for tr_tag in table_row.find_html_recursively("tr"):
            row_headers = []
            has_th_tag = False
            for th_tag in tr_tag.find_html("th"):
                row_headers.append(clean_node(wxr, None, th_tag))
                has_th_tag = True
            for td_index, td_tag in enumerate(tr_tag.find_html("td")):
                # A cell with "colspan" gets no column header.
                td_col_headers = (
                    [] if "colspan" in td_tag.attrs else column_headers
                )
                if not td_tag.contain_node(NodeKind.LINK):
                    add_form_data(
                        word_entry,
                        clean_node(wxr, None, td_tag),
                        row_headers,
                        td_col_headers,
                        td_index + 1 if has_th_tag else td_index,
                    )
                    continue
                is_header_td = (
                    td_tag.attrs.get("bgcolor", "").lower() == "#eef9ff"
                )
                for link_node in td_tag.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if is_header_td:
                        row_headers.append(link_text)
                    else:
                        add_form_data(
                            word_entry,
                            link_text,
                            row_headers,
                            td_col_headers,
                            td_index,
                        )
def add_form_data(
    word_entry: WordEntry,
    form_text: str,
    row_headers: list[str],
    col_headers: list[str],
    col_index: int,
) -> None:
    """Append one tagged form to ``word_entry.forms``.

    ``form_text`` is stripped of surrounding spaces and slashes.  Nothing is
    added when the result is empty or a dash placeholder ("—" or "-").
    Otherwise the form is tagged with every row header and, when
    ``col_index`` is below ``len(col_headers)`` and that header is
    non-empty, with that column header; then ``translate_raw_tags`` is
    applied and the form is appended.

    Args:
        word_entry: Entry whose ``forms`` list is extended in place.
        form_text: Raw text of the form as found in the table cell.
        row_headers: Labels of the row headers that apply to the cell.
        col_headers: Column header labels indexed by column position.
        col_index: Column position of the cell.
    """
    form = Form(form=form_text.strip(" /"))
    if form.form in ("", "—", "-"):
        return  # empty cell or placeholder: no form to record

    form.raw_tags.extend(row_headers)
    if col_index < len(col_headers):
        column_label = col_headers[col_index]
        if column_label != "":
            form.raw_tags.append(column_label)
    translate_raw_tags(form)
    word_entry.forms.append(form)
def extract_прил_ru_comparative_forms(
    wxr: WiktextractContext, word_entry: WordEntry, expanded_node: WikiNode
) -> None:
    """Collect comparative forms listed after "Сравнительная степень —".

    Walks the direct children of ``expanded_node``.  Once a string child
    whose cleaned text ends with "Сравнительная степень —" has been seen,
    every link directly inside each later italic child is added to
    ``word_entry.forms`` as a form tagged ``comparative`` (links whose
    cleaned text is empty are skipped).

    Args:
        wxr: Extraction context forwarded to ``clean_node``.
        word_entry: Entry whose ``forms`` list is extended in place.
        expanded_node: Node whose direct children are scanned.
    """
    marker_seen = False
    for child in expanded_node.children:
        if isinstance(child, str):
            child_text = clean_node(wxr, None, child)
            if child_text.endswith("Сравнительная степень —"):
                marker_seen = True
            continue
        if not marker_seen:
            continue
        if not isinstance(child, WikiNode) or child.kind != NodeKind.ITALIC:
            continue
        for link_node in child.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text != "":
                word_entry.forms.append(
                    Form(form=link_text, tags=["comparative"])
                )