Coverage for src/wiktextract/extractor/ru/linkage.py: 86%
125 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from wikitextprocessor import (
2 HTMLNode,
3 NodeKind,
4 TemplateNode,
5 WikiNode,
6)
8from ...page import clean_node
9from ...wxr_context import WiktextractContext
10from .models import Linkage, WordEntry
11from .section_titles import LINKAGE_TITLES
12from .tags import translate_raw_tags
def extract_linkages(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    level_node: WikiNode,
):
    """Collect linkage words from the list items below ``level_node``.

    Every ``LIST_ITEM`` found recursively under ``level_node`` is scanned
    child by child: a link node supplies the word, a template node supplies
    raw tags (through ``find_linkage_tag``), and a string node that is just
    ``;`` or ``,`` closes the word collected so far.  Each finished entry
    is appended to the ``linkage_type`` attribute of ``word_entry``.

    :param wxr: extraction context used to clean nodes and emit debug output.
    :param word_entry: entry that receives the extracted linkages.
    :param linkage_type: name of the list attribute on ``word_entry`` to fill.
    :param level_node: node whose list items are scanned.
    """
    # Read ``model_fields`` from the class instead of the instance; instance
    # access is deprecated in recent Pydantic releases.
    if linkage_type not in type(word_entry).model_fields:
        wxr.wtp.debug(
            f"Linkage type {linkage_type} not defined for word entry",
            sortid="extractor/ru/linkage/extract_linkages/10",
        )
        return

    sense_index = 0
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Only items whose list marker is exactly "#" advance the sense
        # counter; other items keep the index of the preceding one.
        if list_item.sarg == "#":
            sense_index += 1
        linkage = Linkage(sense_index=sense_index)
        for node in list_item.children:
            if isinstance(node, WikiNode):
                if node.kind == NodeKind.LINK:
                    linkage.word = clean_node(wxr, None, node)
                elif isinstance(node, TemplateNode):
                    find_linkage_tag(wxr, linkage, node)
            elif isinstance(node, str) and node.strip() in (";", ","):
                # A separator without a preceding word is ignored, so tags
                # gathered so far stay attached to the pending linkage.
                if len(linkage.word) > 0:
                    translate_raw_tags(linkage)
                    getattr(word_entry, linkage_type).append(linkage)
                    finished = linkage
                    linkage = Linkage(sense_index=sense_index)
                    if node.strip() == ",":
                        # After a comma the next word inherits the raw tags
                        # left on the finished entry.  Hand over a copy:
                        # find_linkage_tag() appends in place, and sharing
                        # one list would also alter the stored entry.
                        linkage.raw_tags = list(finished.raw_tags)

        # Flush the last word of the list item (no trailing separator).
        if len(linkage.word) > 0:
            translate_raw_tags(linkage)
            getattr(word_entry, linkage_type).append(linkage)
def find_linkage_tag(
    wxr: WiktextractContext,
    linkage: Linkage,
    template_node: TemplateNode,
) -> None:
    """Add the text of each ``<span>`` in the expanded template as a raw tag.

    The template is converted back to wikitext, parsed again with full
    expansion, and every ``span`` element found anywhere in the result
    contributes its cleaned, non-empty text to ``linkage.raw_tags``.

    :param wxr: extraction context providing the parser and node cleaning.
    :param linkage: linkage whose ``raw_tags`` list is extended in place.
    :param template_node: template node to expand and inspect.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for span in expanded_root.find_html_recursively("span"):
        span_text = clean_node(wxr, None, span)
        if not span_text:
            # Nothing readable inside this span.
            continue
        linkage.raw_tags.append(span_text)
def process_related_block_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract related words from the tables of an expanded template.

    Used for the "Родственные слова" ("Related words") section, template
    Шаблон:родств-блок.  The template is expanded and each table in the
    result is walked row by row.  Header text is remembered per table,
    ``span`` text inside list items is remembered per row, and every link
    inside a list item becomes a ``Linkage`` tagged with whichever of the
    two headers is non-empty.  Results go to ``word_entry.related``.

    :param wxr: extraction context providing the parser and node cleaning.
    :param word_entry: entry whose ``related`` list receives the linkages.
    :param template_node: the template node to expand.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    item_child_kinds = NodeKind.HTML | NodeKind.LINK

    for table in root.find_child(NodeKind.TABLE):
        # Kept across rows: once set, it tags every later link in the table.
        table_header = ""
        for table_row in table.find_child(NodeKind.TABLE_ROW):
            # Reset for every row.
            row_header = ""
            for cell in table_row.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text = clean_node(wxr, None, cell)
                    # Only the "list of all words with the root ..." caption
                    # is accepted as a table header.
                    if header_text.startswith("Список всех слов с корнем"):
                        table_header = header_text
                    continue
                if cell.kind != NodeKind.TABLE_CELL:
                    continue
                if "block-head" in cell.attrs.get("class", ""):
                    # A regular cell styled as a heading.
                    table_header = clean_node(wxr, None, cell)
                    continue

                for list_item in cell.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for child in list_item.find_child(item_child_kinds):
                        if isinstance(child, HTMLNode) and child.tag == "span":
                            span_text = clean_node(wxr, None, child)
                            row_header = span_text.removesuffix(":")
                            continue
                        if child.kind != NodeKind.LINK:
                            continue

                        linkage = Linkage(word=clean_node(wxr, None, child))
                        for header in (table_header, row_header):
                            if header != "":
                                linkage.raw_tags.append(header)
                        if linkage.word != "":
                            translate_raw_tags(linkage)
                            word_entry.related.append(linkage)
def _append_phrase_linkage(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    nodes: list,
    title_text: str,
) -> None:
    """Clean ``nodes`` into one phrase and store it on ``word_entry``.

    Nothing is stored when the cleaned text is empty.  The section title is
    added as a raw tag unless it is one of the two generic titles, and the
    title "пословицы и поговорки" routes the entry to ``proverbs`` instead
    of ``derived``.
    """
    word = clean_node(wxr, None, nodes)
    if len(word) == 0:
        return
    linkage = Linkage(word=word)
    if title_text not in (
        "фразеологизмы и устойчивые сочетания",
        "пословицы и поговорки",
    ):
        linkage.raw_tags.append(title_text)
        translate_raw_tags(linkage)
    if title_text == "пословицы и поговорки":
        word_entry.proverbs.append(linkage)
    else:
        word_entry.derived.append(linkage)


def extract_phrase_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
    title_text: str,
) -> None:
    """Extract phrases from a section's templates and lists.

    Handles the "Фразеологизмы и устойчивые сочетания" section and similar
    ones.  Column templates are expanded and their ``div.col3`` content is
    processed recursively; then every list item directly or indirectly
    below the lists of ``level_node`` is split into phrases.

    Inside a list item, everything before the first link is a prefix that
    is prepended to every phrase of that item.  After the first link, a
    string node ending in ``,``, ``;`` or ``/`` outside parentheses closes
    the current phrase.

    :param wxr: extraction context providing the parser and node cleaning.
    :param word_entry: entry whose ``derived``/``proverbs`` lists are filled.
    :param level_node: section node (or expanded ``div``) to scan.
    :param title_text: section title, used for tagging and routing.
    """
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        # a template that adds links to words in list
        # https://ru.wiktionary.org/wiki/Шаблон:в_три_колонки
        if t_node.template_name.lower() in ["в три колонки", "фразеологизмы"]:
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(t_node), expand_all=True
            )
            for div_tag in expanded_node.find_html(
                "div", attr_name="class", attr_value="col3"
            ):
                extract_phrase_section(wxr, word_entry, div_tag, title_text)

    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child_recursively(NodeKind.LIST_ITEM):
            prefix_nodes = []
            before_link = True
            word_nodes = []
            inside_brackets = False
            for node in list_item.children:
                if isinstance(node, str) and len(node.strip()) > 0:
                    stripped = node.strip()
                    if before_link:
                        prefix_nodes.append(node)
                    elif stripped.startswith("("):
                        inside_brackets = True
                        word_nodes.append(node)
                    elif stripped.startswith(")"):
                        inside_brackets = False
                        # Drop the separator that follows the bracket.
                        word_nodes.append(node.strip(",; "))
                    elif inside_brackets:
                        word_nodes.append(node)

                    # A separator outside parentheses ends the phrase.
                    if not inside_brackets and stripped.endswith(
                        (",", ";", "/")
                    ):
                        _append_phrase_linkage(
                            wxr,
                            word_entry,
                            prefix_nodes + word_nodes,
                            title_text,
                        )
                        # Keep the prefix, start a fresh phrase.
                        word_nodes.clear()
                elif isinstance(node, WikiNode):
                    if node.kind == NodeKind.LIST:
                        # Nested lists are visited by the recursive
                        # list-item search above.
                        continue
                    elif node.kind == NodeKind.LINK:
                        before_link = False
                    if before_link:
                        prefix_nodes.append(node)
                    else:
                        word_nodes.append(node)

            # Flush whatever remains after the last separator.
            _append_phrase_linkage(
                wxr, word_entry, prefix_nodes + word_nodes, title_text
            )
def process_semantics_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    sense_index: int,
) -> None:
    """Turn the parameters of a "семантика" template into linkages.

    See https://ru.wiktionary.org/wiki/Шаблон:семантика.  Each parameter
    whose name is a key of ``LINKAGE_TITLES`` and whose value is a plain
    string is split on commas.  Every piece other than an empty string or
    ``-`` becomes a ``Linkage`` appended to the ``word_entry`` attribute
    named by ``LINKAGE_TITLES``.

    :param wxr: extraction context (not used by this function).
    :param word_entry: entry that receives the linkages.
    :param template_node: the template whose parameters are read.
    :param sense_index: sense index stored on every created linkage.
    """
    for param_name, param_value in template_node.template_parameters.items():
        if param_name not in LINKAGE_TITLES or not isinstance(
            param_value, str
        ):
            continue
        for piece in param_value.split(","):
            candidate = piece.strip()
            if candidate == "" or candidate == "-":
                # Blank entries and "-" placeholders carry no word.
                continue
            field_name = LINKAGE_TITLES[param_name]
            getattr(word_entry, field_name).append(
                Linkage(word=candidate, sense_index=sense_index)
            )