Coverage for src/wiktextract/extractor/ru/inflection.py: 98%
103 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from collections import defaultdict
2from dataclasses import dataclass
4from wikitextprocessor import HTMLNode, NodeKind, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
@dataclass
class TableHeader:
    """A header cell of an inflection table together with its extent."""

    # Cleaned header text; used as a raw tag on the forms it covers.
    text: str
    # Index of the first column covered by the header (row headers use 0).
    start_index: int
    # Number of columns (from ``colspan``) or rows (from ``rowspan``) covered.
    span: int
def parse_adj_forms_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    expanded_template: WikiNode,
) -> None:
    """Extract adjective forms from the HTML tables of an expanded template.

    Every ``<table>`` found in ``expanded_template`` is walked row by row.
    Rows without any ``<td>`` are treated as column-header rows; in all other
    rows, ``<td>`` cells with the header background colour are row headers
    and the remaining cells hold the forms.  Each whitespace-separated word
    of a form cell becomes a ``Form`` tagged with the texts of the column
    headers spanning its column and of the row headers currently in effect,
    and is appended to ``word_entry.forms``.

    Args:
        wxr: Context object passed through to ``clean_node``.
        word_entry: Entry whose ``forms`` list receives the extracted forms.
        expanded_template: Root node of the expanded template.
    """
    # HTML table
    # https://ru.wiktionary.org/wiki/Шаблон:прил
    for table_element in expanded_template.find_html("table"):
        column_headers: list[TableHeader] = []
        row_headers: list[TableHeader] = []
        # column index -> number of following rows in which that column is
        # still occupied by a cell with a "rowspan" attribute
        td_rowspan: defaultdict[int, int] = defaultdict(int)
        for tr_element in table_element.find_html("tr"):
            if not list(tr_element.find_html("td")):
                # all header
                current_index = 0
                for th_element in tr_element.find_html("th"):
                    header_text = ""
                    for header_link in th_element.find_child(NodeKind.LINK):
                        header_text = clean_node(wxr, None, header_link)
                    if header_text == "падеж":
                        continue  # ignore top left corner header
                    header_span = int(th_element.attrs.get("colspan", "1"))
                    column_headers.append(
                        TableHeader(header_text, current_index, header_span)
                    )
                    current_index += header_span
            else:
                col_index = 0
                has_rowspan = False
                for td_element in tr_element.find_html("td"):
                    # Compare the colour case-insensitively, as
                    # parse_wikitext_forms_table does.
                    bgcolor = td_element.attrs.get("bgcolor", "").lower()
                    if bgcolor == "#eef9ff":
                        # this is a td tag but contains header text
                        header_text = ""
                        for header_link in td_element.find_child(
                            NodeKind.LINK
                        ):
                            header_text = clean_node(wxr, None, header_link)
                        header_span = int(td_element.attrs.get("rowspan", "1"))
                        row_headers.append(
                            TableHeader(header_text, 0, header_span)
                        )
                        continue
                    if "rowspan" in td_element.attrs:
                        td_rowspan[col_index] = (
                            int(td_element.attrs["rowspan"]) - 1
                        )
                        has_rowspan = True
                    elif not has_rowspan:
                        # Skip columns still occupied by a rowspan cell
                        # from an earlier row.
                        for rowspan_index, rowspan_value in td_rowspan.items():
                            if rowspan_value > 0 and col_index == rowspan_index:
                                col_index += 1
                                td_rowspan[rowspan_index] -= 1
                    td_text = clean_node(wxr, None, td_element)
                    # The tags depend only on the cell position, so compute
                    # them once per cell rather than once per word.
                    column_tags = [
                        col_header.text
                        for col_header in column_headers
                        if col_header.start_index
                        <= col_index
                        < col_header.start_index + col_header.span
                    ]
                    row_tags = [h.text for h in row_headers]
                    # str.split() without arguments never yields empty
                    # strings, so every word is a usable form.
                    for line in td_text.split():
                        form = Form(form=line)
                        form.raw_tags.extend(column_tags)
                        form.raw_tags.extend(row_tags)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    col_index += 1

            # Keep only the row headers whose rowspan reaches the next row.
            updated_row_headers = []
            for row_header in row_headers:
                if row_header.span > 1:
                    row_header.span -= 1
                    updated_row_headers.append(row_header)
            row_headers = updated_row_headers
def parse_wikitext_forms_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    expanded_template: WikiNode,
) -> None:
    """Collect forms from the first wikitext table of an expanded template.

    Header cells contribute column headers, cells with the header
    background colour contribute row headers, and every other cell
    contributes one form per text line via ``add_form_data``.  Raw ``<tr>``
    HTML tags found anywhere inside a table row are processed as extra rows.
    Finally the whole template is cleaned with ``word_entry`` as the target
    so that category links are recorded.  Nothing is done when the template
    has no direct table child.

    Args:
        wxr: Context object passed through to ``clean_node``.
        word_entry: Entry that receives the forms and categories.
        expanded_template: Root node of the expanded template.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:сущ-ru
    # Шаблон:inflection сущ ru
    # Шаблон:Гл-блок
    tables = list(expanded_template.find_child(NodeKind.TABLE))
    if not tables:
        return
    first_table = tables[0]
    header_bgcolor = "#eef9ff"
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    column_headers = []
    for table_row in first_table.find_child(NodeKind.TABLE_ROW):
        row_headers = []
        for col_index, table_cell in enumerate(
            table_row.find_child(cell_kinds)
        ):
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                column_headers.append(clean_node(wxr, None, table_cell))
                continue
            if table_cell.kind != NodeKind.TABLE_CELL:
                continue

            # remove cursed <tr> tag
            children_without_tr = [
                child
                for child in table_cell.children
                if not (isinstance(child, HTMLNode) and child.tag == "tr")
            ]
            cell_text = clean_node(wxr, None, children_without_tr)

            if table_cell.attrs.get("bgcolor", "").lower() != header_bgcolor:
                # ordinary cell: one form per line of text
                for form_text in cell_text.splitlines():
                    add_form_data(
                        word_entry,
                        form_text,
                        row_headers,
                        column_headers,
                        col_index,
                    )
                continue

            # header-coloured cell: it names the row
            if cell_text == "М." and table_cell.contain_node(NodeKind.LINK):
                # use the first argument of the first direct link child
                # instead of the abbreviated cell text
                first_link = next(
                    iter(table_cell.find_child(NodeKind.LINK)), None
                )
                if first_link is not None:
                    row_headers.append(first_link.largs[0][0])
            else:
                row_headers.append(cell_text)

        # cursed layout from Шаблон:Гл-блок
        # tr tag could be after or inside table cell node: Шаблон:сущ cu (-а)
        for tr_tag in table_row.find_html_recursively("tr"):
            row_headers = []
            for td_index, td_tag in enumerate(tr_tag.find_html("td")):
                # cells carrying a colspan get no column header tag
                if "colspan" in td_tag.attrs:
                    cell_col_headers = []
                else:
                    cell_col_headers = column_headers

                if not td_tag.contain_node(NodeKind.LINK):
                    add_form_data(
                        word_entry,
                        clean_node(wxr, None, td_tag),
                        row_headers,
                        cell_col_headers,
                        td_index,
                    )
                    continue

                is_header_td = (
                    td_tag.attrs.get("bgcolor", "").lower() == header_bgcolor
                )
                for link_node in td_tag.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if is_header_td:
                        row_headers.append(link_text)
                    else:
                        add_form_data(
                            word_entry,
                            link_text,
                            row_headers,
                            cell_col_headers,
                            td_index,
                        )

    clean_node(wxr, word_entry, expanded_template)  # add category links
def add_form_data(
    word_entry: WordEntry,
    form_text: str,
    row_headers: list[str],
    col_headers: list[str],
    col_index: int,
) -> None:
    """Append one tagged form to ``word_entry.forms``.

    ``form_text`` is stripped of surrounding spaces and slashes.  A form
    that is empty or consists only of an em dash is discarded.  Otherwise
    the form is tagged with every row header, plus the column header at
    ``col_index`` when that index is below the number of column headers,
    and its raw tags are translated before it is stored.

    Args:
        word_entry: Entry whose ``forms`` list receives the form.
        form_text: Raw text of the form.
        row_headers: Row header texts added as raw tags.
        col_headers: Column header texts, looked up by ``col_index``.
        col_index: Position of the cell within its row.
    """
    form = Form(form=form_text.strip(" /"))
    if form.form in ("", "—"):
        # empty cell or placeholder dash: nothing to record
        return

    raw_tags = list(row_headers)
    if col_index < len(col_headers):
        raw_tags.append(col_headers[col_index])
    form.raw_tags.extend(raw_tags)

    translate_raw_tags(form)
    word_entry.forms.append(form)