Coverage for src/wiktextract/extractor/ru/linkage.py: 83%
130 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1from wikitextprocessor import (
2 HTMLNode,
3 LevelNode,
4 NodeKind,
5 TemplateNode,
6 WikiNode,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .models import Form, Linkage, WordEntry
12from .section_titles import LINKAGE_TITLES
13from .tags import translate_raw_tags
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    level_node: LevelNode,
) -> None:
    """Collect linkage words from the list items below ``level_node``.

    Every list item found recursively is scanned child by child: a link
    node supplies the word, a template node contributes raw tags through
    ``find_linkage_tag``, and a bare ``";"`` or ``","`` string ends the
    current word.  Finished words are appended to the ``linkage_type``
    attribute of ``word_entry``.

    Args:
        wxr: Extraction context, passed on to ``clean_node`` and the helpers.
        word_entry: Entry whose ``linkage_type`` list receives the results.
        linkage_type: Name of the list attribute on ``word_entry``.
        level_node: Section node whose list items and templates are read.
    """
    sense_index = 0
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Only items whose list marker is exactly "#" advance the sense
        # counter; items with any other marker keep the current index.
        if list_item.sarg == "#":
            sense_index += 1
        linkage = Linkage(sense_index=sense_index)
        for node in list_item.children:
            if isinstance(node, WikiNode):
                if node.kind == NodeKind.LINK:
                    linkage.word = clean_node(wxr, None, node)
                elif isinstance(node, TemplateNode):
                    find_linkage_tag(wxr, linkage, node)
            elif isinstance(node, str) and node.strip() in (";", ","):
                separator = node.strip()
                if len(linkage.word) > 0:
                    translate_raw_tags(linkage)
                    getattr(word_entry, linkage_type).append(linkage)
                carried_tags = linkage.raw_tags
                linkage = Linkage(sense_index=sense_index)
                if separator == ",":
                    # A comma keeps the raw tags for the next word, a
                    # semicolon drops them.  Copy the list so that tags
                    # added to the next word cannot leak back into the
                    # linkage that has already been stored.
                    linkage.raw_tags = list(carried_tags)

        # Store the last word of the item, which has no trailing separator.
        if len(linkage.word) > 0:
            translate_raw_tags(linkage)
            getattr(word_entry, linkage_type).append(linkage)

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "родств-блок":
            process_related_block_template(
                wxr, word_entry, t_node, linkage_type
            )
def find_linkage_tag(
    wxr: WiktextractContext,
    linkage: Linkage,
    template_node: TemplateNode,
) -> None:
    """Append the text of each ``<span>`` in the expanded template as a raw tag.

    The template is converted back to wikitext, parsed with full expansion,
    and every ``span`` element found recursively is cleaned with
    ``clean_node``.  Non-empty results are appended to ``linkage.raw_tags``.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    for span in expanded_root.find_html_recursively("span"):
        tag_text = clean_node(wxr, None, span)
        if len(tag_text) == 0:
            continue
        linkage.raw_tags.append(tag_text)
def process_related_block_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    l_type: str,
) -> None:
    """Extract linked words from the tables produced by the template.

    The template is expanded and each table directly under the expansion
    is walked row by row.  Header text found in the table and in the list
    items is attached to each linked word as raw tags, and every non-empty
    word is appended to the ``l_type`` list attribute of ``word_entry``.
    """
    # "Родственные слова" (related words) section
    # Template page: Шаблон:родств-блок
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    item_child_kinds = NodeKind.HTML | NodeKind.LINK

    for table in expanded_root.find_child(NodeKind.TABLE):
        # The table header persists across rows; the row header is reset
        # at the start of every row.
        table_header = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            for cell in row.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text = clean_node(wxr, None, cell)
                    if header_text.startswith("Список всех слов с корнем"):
                        table_header = header_text
                    continue
                if cell.kind != NodeKind.TABLE_CELL:
                    continue
                if "block-head" in cell.attrs.get("class", ""):
                    table_header = clean_node(wxr, None, cell)
                    continue

                for list_item in cell.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for child in list_item.find_child(item_child_kinds):
                        if isinstance(child, HTMLNode) and child.tag == "span":
                            # A span updates the row header used for the
                            # links that follow it.
                            span_text = clean_node(wxr, None, child)
                            row_header = span_text.removesuffix(":")
                            continue
                        if child.kind != NodeKind.LINK:
                            continue

                        linkage = Linkage(word=clean_node(wxr, None, child))
                        for header in (table_header, row_header):
                            if header != "":
                                linkage.raw_tags.append(header)
                        if linkage.word == "":
                            continue
                        translate_raw_tags(linkage)
                        getattr(word_entry, l_type).append(linkage)
def extract_phrase_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
    title_text: str,
) -> None:
    """Extract phrases from the lists of a phrase section.

    Column templates are expanded first and each ``div`` with class
    ``col3`` in the expansion is processed recursively.  Then every list
    item under ``level_node`` is split into phrases: nodes before the first
    link form a prefix shared by all phrases of the item, and a string
    ending in ``","``, ``";"`` or ``"/"`` outside brackets ends the current
    phrase.  Phrases are stored in ``word_entry.proverbs`` when
    ``title_text`` is "пословицы и поговорки" and in ``word_entry.derived``
    otherwise.

    Args:
        wxr: Extraction context, passed on to ``clean_node``.
        word_entry: Entry that receives the extracted phrases.
        level_node: Section node, or an expanded ``div`` on recursion.
        title_text: Section title; selects the target list and may be
            recorded as a raw tag.
    """
    # "Фразеологизмы и устойчивые сочетания" (phrases and set expressions)
    # section
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        # a template that adds links to words in list
        # https://ru.wiktionary.org/wiki/Шаблон:в_три_колонки
        if t_node.template_name.lower() in ("в три колонки", "фразеологизмы"):
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(t_node), expand_all=True
            )
            for div_tag in expanded_node.find_html(
                "div", attr_name="class", attr_value="col3"
            ):
                extract_phrase_section(wxr, word_entry, div_tag, title_text)

    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child_recursively(NodeKind.LIST_ITEM):
            prefix_nodes = []
            word_nodes = []
            before_link = True
            inside_brackets = False
            for node in list_item.children:
                if isinstance(node, str):
                    text = node.strip()
                    if len(text) == 0:
                        continue
                    if before_link:
                        prefix_nodes.append(node)
                    elif text.startswith("("):
                        inside_brackets = True
                        word_nodes.append(node)
                    elif text.startswith(")"):
                        inside_brackets = False
                        word_nodes.append(node.strip(",; "))
                    elif inside_brackets:
                        word_nodes.append(node)

                    # A separator outside brackets ends the current phrase;
                    # the prefix is kept for the next phrase of the item.
                    if not inside_brackets and text.endswith((",", ";", "/")):
                        _append_phrase_linkage(
                            wxr,
                            word_entry,
                            prefix_nodes + word_nodes,
                            title_text,
                        )
                        word_nodes.clear()
                elif isinstance(node, WikiNode):
                    if node.kind == NodeKind.LIST:
                        # Nested lists are skipped here; their items are
                        # already covered by the recursive item search.
                        continue
                    if node.kind == NodeKind.LINK:
                        before_link = False
                    if before_link:
                        prefix_nodes.append(node)
                    else:
                        word_nodes.append(node)

            # Store whatever remains after the last separator.
            _append_phrase_linkage(
                wxr, word_entry, prefix_nodes + word_nodes, title_text
            )


def _append_phrase_linkage(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    nodes: list,
    title_text: str,
) -> None:
    """Clean ``nodes`` into one phrase and store it if it is non-empty."""
    word = clean_node(wxr, None, nodes)
    if len(word) == 0:
        return
    linkage = Linkage(word=word)
    # Any title other than these two is recorded as a raw tag.
    if title_text not in (
        "фразеологизмы и устойчивые сочетания",
        "пословицы и поговорки",
    ):
        linkage.raw_tags.append(title_text)
        translate_raw_tags(linkage)
    if title_text == "пословицы и поговорки":
        word_entry.proverbs.append(linkage)
    else:
        word_entry.derived.append(linkage)
def process_semantics_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    sense_index: int,
) -> None:
    """Turn the parameters of the semantics template into linkages.

    Each string parameter whose name is a key of ``LINKAGE_TITLES`` is
    split on commas.  Every piece that is not empty, ``"-"`` or ``"—"``
    after stripping becomes a ``Linkage`` with the given ``sense_index``,
    appended to the ``word_entry`` list named by ``LINKAGE_TITLES``.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:семантика
    placeholders = ("", "-", "—")
    for param_name, param_value in template_node.template_parameters.items():
        if param_name not in LINKAGE_TITLES:
            continue
        if not isinstance(param_value, str):
            continue
        field_name = LINKAGE_TITLES[param_name]
        for piece in param_value.split(","):
            term = piece.strip()
            if term in placeholders:
                continue
            getattr(word_entry, field_name).append(
                Linkage(word=term, sense_index=sense_index)
            )
def extract_alt_form_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    tags: list[str],
) -> None:
    """Record the text of every link under ``level_node`` as a form.

    Links are searched recursively and cleaned with ``clean_node``; each
    non-empty result is appended to ``word_entry.forms`` as a ``Form``
    carrying the given ``tags``.
    """
    for link in level_node.find_child_recursively(NodeKind.LINK):
        form_text = clean_node(wxr, None, link)
        if form_text == "":
            continue
        word_entry.forms.append(Form(form=form_text, tags=tags))