Coverage for src/wiktextract/extractor/ru/gloss.py: 93%
113 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 NodeKind,
6 TemplateNode,
7 WikiNode,
8 WikiNodeChildrenList,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .example import EXAMPLE_TEMPLATES, process_example_template
14from .linkage import process_semantics_template
15from .models import Linkage, Sense, WordEntry
16from .section_titles import LINKAGE_TITLES
17from .tags import translate_raw_tags
# Template names that are dropped from a gloss instead of being rendered.
# NOTE(review): the only lookup visible in this module lower-cases the
# template name first, so the capitalised spelling looks redundant there; it
# is kept so the set's contents stay unchanged for any other user.
IGNORED_TEMPLATES = {"нужен перевод", "??", "?", "Нужен перевод"}

# Abbreviation templates mapped to the tag they add to a sense.  The template
# itself is still kept as part of the gloss text.
TAG_GLOSS_TEMPLATES = {
    "многокр.": "iterative",
    "нареч.": "adverb",
    "наречие": "adverb",  # redirect to "нареч."
    "однокр.": "semelfactive",  # was misspelled "semelefactive"
    "превосх.": "superlative",
    "прич.": "participle",
    "сокр.": "abbreviation",
    "сравн.": "comparative",
    "страд.": "passive",
    "счётн.": "numeral",
}
def extract_gloss(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Extract the senses found under one section heading.

    Each list item of each list that is a direct child of ``level_node`` is
    passed to ``process_gloss_list_item``; the sense index restarts at 1 for
    every list.  If no list item was processed, the children returned by
    ``level_node.invert_find_child(LEVEL_KIND_FLAGS)`` are converted back to
    wikitext, re-parsed, and handled as one sense with index 1.

    The cleaned heading text (``level_node.largs``) is forwarded as
    ``section_title`` in both cases.
    """
    section_title = clean_node(wxr, None, level_node.largs)
    found_list_item = False
    for gloss_list in level_node.find_child(NodeKind.LIST):
        list_items = gloss_list.find_child(NodeKind.LIST_ITEM)
        for index, item in enumerate(list_items, start=1):
            process_gloss_list_item(
                wxr, word_entry, item, index, section_title=section_title
            )
            found_list_item = True
    if found_list_item:
        return

    # No list item at all: treat the section body as a single gloss.
    body_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    body_wikitext = wxr.wtp.node_to_wikitext(body_nodes)
    body_root = wxr.wtp.parse(body_wikitext)
    process_gloss_list_item(
        wxr, word_entry, body_root, 1, section_title=section_title
    )
def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense_index: int,
    parent_sense: Sense | None = None,
    section_title: str = "",
) -> None:
    """Build a sense from one list item and recurse into its nested lists.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  A ``section_title`` other than "",
    "Значение" or "Значения" is recorded as a raw tag.

    Children of ``list_item`` are dispatched as follows:

    * example templates, "семантика" and "значение" are handed to their
      dedicated processors;
    * templates listed in ``TAG_GLOSS_TEMPLATES`` add a tag and stay in the
      gloss;
    * templates whose name ends with "." (plus "причастие" and "умласк") go
      through ``extract_dot_template``;
    * "помета" becomes a raw tag unless it has a "nocolor" parameter, in
      which case it stays in the gloss;
    * other templates stay in the gloss unless their lower-cased name is in
      ``IGNORED_TEMPLATES``;
    * non-template children stay in the gloss, except nested lists.

    The sense is appended to ``word_entry.senses`` only if it ends up with at
    least one gloss.  Items of nested lists are then processed with this
    sense as their parent and the same ``sense_index``.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)
    if section_title not in ("", "Значение", "Значения"):
        sense.raw_tags.append(section_title)

    gloss_nodes = []
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            # Nested lists are sub-senses; they are handled after the loop.
            is_nested_list = (
                isinstance(child, WikiNode) and child.kind == NodeKind.LIST
            )
            if not is_nested_list:
                gloss_nodes.append(child)
            continue

        name = child.template_name
        if name in EXAMPLE_TEMPLATES:
            process_example_template(wxr, sense, child)
        elif name == "семантика":
            process_semantics_template(wxr, word_entry, child, sense_index)
        elif name in TAG_GLOSS_TEMPLATES:
            sense.tags.append(TAG_GLOSS_TEMPLATES[name])
            gloss_nodes.append(child)
        elif name.endswith(".") or name in ("причастие", "умласк"):
            extract_dot_template(wxr, sense, child, gloss_nodes)
        elif name == "помета":
            if "nocolor" in child.template_parameters:
                gloss_nodes.append(child)
            else:
                label = clean_node(wxr, sense, child)
                if label not in ("", "?"):
                    sense.raw_tags.append(label)
        elif name == "значение":
            process_meaning_template(wxr, sense, word_entry, child)
        elif name.lower() not in IGNORED_TEMPLATES:
            gloss_nodes.append(child)

    remove_obsolete_leading_nodes(gloss_nodes)
    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if gloss_text:
        sense.glosses.append(gloss_text)
    if sense.glosses:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(
                wxr, word_entry, nested_item, sense_index, sense
            )
def remove_obsolete_leading_nodes(nodes: WikiNodeChildrenList):
    """Drop leading filler strings from ``nodes`` in place.

    A leading element is removed while it is a string that, once stripped,
    is empty or one of "и", "или", ",", ".", ";", ":".  Removal stops at the
    first element that is not such a string; later elements are untouched.
    """
    filler = ("", "и", "или", ",", ".", ";", ":")
    prefix_length = 0
    for node in nodes:
        if not (isinstance(node, str) and node.strip() in filler):
            break
        prefix_length += 1
    # Delete the whole prefix at once; the caller's list object is mutated.
    del nodes[:prefix_length]
def process_meaning_template(
    wxr: WiktextractContext,
    sense: Sense | None,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> Sense:
    """Fill a sense from the arguments of a "значение" template.

    https://ru.wiktionary.org/wiki/Шаблон:значение

    Handled arguments:

    * "определение": cleaned text is appended to ``sense.glosses``;
    * "пометы": cleaned text is appended to ``sense.raw_tags``;
    * "примеры": every template node in the value is passed to
      ``process_example_template``;
    * any name found in ``LINKAGE_TITLES``: words are appended as ``Linkage``
      objects to the matching list attribute of ``word_entry``.  A string
      value is split on "," and ";" (empty pieces and "-" are skipped); from
      a list value only link nodes are used.

    The definition is resolved before the arguments are walked, so each
    linkage carries the gloss as its ``sense`` regardless of the order in
    which the arguments were written.

    Args:
        sense: Sense to update; a new ``Sense`` is created when ``None``.

    Returns:
        The updated (or newly created) sense.
    """
    if sense is None:
        sense = Sense()

    params = template_node.template_parameters
    # Previously `gloss` was only set once the loop reached "определение",
    # so linkage arguments written before it got an empty sense.
    gloss = ""
    if "определение" in params:
        gloss = clean_node(wxr, None, params["определение"])

    for param_name, param_value in params.items():
        if param_name == "определение":
            if gloss:
                sense.glosses.append(gloss)
        elif param_name == "пометы":
            raw_tag = clean_node(wxr, None, param_value)
            if raw_tag:
                sense.raw_tags.append(raw_tag)
        elif param_name == "примеры" and isinstance(param_value, list):
            for t_node in param_value:
                if isinstance(t_node, TemplateNode):
                    process_example_template(wxr, sense, t_node)
        elif param_name in LINKAGE_TITLES:
            linkage_type = LINKAGE_TITLES[param_name]
            linkage_words = []
            if isinstance(param_value, str):
                for piece in re.split(r",|;", param_value):
                    piece = piece.strip()
                    if piece and piece != "-":
                        linkage_words.append(piece)
            elif isinstance(param_value, list):
                for param_node in param_value:
                    if (
                        isinstance(param_node, WikiNode)
                        and param_node.kind == NodeKind.LINK
                    ):
                        linked_word = clean_node(wxr, None, param_node)
                        if linked_word:
                            linkage_words.append(linked_word)
            # The attribute is only looked up when there is something to
            # add, as before.
            if linkage_words:
                linkage_list = getattr(word_entry, linkage_type)
                for linkage_word in linkage_words:
                    linkage_list.append(
                        Linkage(word=linkage_word, sense=gloss)
                    )

    if sense.glosses:
        translate_raw_tags(sense)

    # Result is discarded; the call is made with the sense attached.
    # NOTE(review): presumably this records side data such as categories on
    # the sense - confirm against clean_node.
    clean_node(wxr, sense, template_node)
    return sense
def extract_dot_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
    gloss_nodes: list[WikiNode | str],
) -> None:
    """Split an expanded template into raw tags and gloss nodes.

    ``t_node`` is converted to wikitext and re-parsed with full expansion.
    A top-level link node containing a ``span`` whose ``style`` attribute
    includes "background-color:#CCFFFF" is treated as a tag: its cleaned
    text, if not empty, is appended to ``sense.raw_tags`` and the node is
    left out of the gloss.  Every other top-level node is appended to
    ``gloss_nodes``.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for node in expanded_root.children:
        is_highlighted_link = (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and any(
                "background-color:#CCFFFF" in span.attrs.get("style", "")
                for span in node.find_html_recursively("span")
            )
        )
        if not is_highlighted_link:
            gloss_nodes.append(node)
            continue
        raw_tag = clean_node(wxr, None, node)
        if raw_tag != "":
            sense.raw_tags.append(raw_tag)