Coverage for src/wiktextract/extractor/ru/gloss.py: 88%
108 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from typing import Optional
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9 WikiNodeChildrenList,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .example import EXAMPLE_TEMPLATES, process_example_template
15from .linkage import process_semantics_template
16from .models import Linkage, Sense, WordEntry
17from .section_titles import LINKAGE_TITLES
18from .tags import translate_raw_tags
# Templates that are part of the clean gloss when expanded.
# process_gloss_nodes keeps a template with one of these names in both the
# clean gloss and the raw gloss, matching on the exact (case-sensitive) name.
# Several names listed here (e.g. "нареч.", "сокр.", "страд.") are also keys
# of TAG_GLOSS_TEMPLATES.
GLOSS_TEMPLATES = {
    "-",
    "=",
    "===",
    "lang",
    "аббр.",
    "выдел",
    "гипокор.",
    "дееприч.",
    "действие",
    "женск.",
    "ласк.",
    "мн",
    "морфема",
    "нареч.",
    "наречие",
    "однокр.",
    "отн.",
    "по.",
    "по",
    "превосх.",
    "прич.",
    "свойство",
    "совершить",
    "сокр.",
    "сокращ",
    "соотн.",
    "сравн.",
    "страд.",
    "то же",
    "увелич.",
    "уменьш.",
    "умласк",
    "умласк.",
    "унич.",
    "уничиж.",
    "хим-элем",
    "элемент",
}
# Templates dropped entirely from a gloss. process_gloss_nodes lower-cases the
# template name before testing membership, so for that lookup only the
# lower-case spellings can match; the capitalised "Нужен перевод" entry is
# redundant there.
# NOTE(review): other modules may import this set and test names without
# lower-casing - confirm before removing the capitalised entry.
IGNORED_TEMPLATES = {"нужен перевод", "??", "?", "Нужен перевод"}
# Gloss templates that also classify the sense: maps the template name to the
# tag that process_gloss_nodes appends to ``Sense.tags``. The template itself
# stays part of the gloss text.
TAG_GLOSS_TEMPLATES = {
    "многокр.": "iterative",
    "нареч.": "adverb",
    "наречие": "adverb",  # redirect to "нареч."
    "однокр.": "semelfactive",
    "превосх.": "superlative",
    "прич.": "participle",
    "сокр.": "abbreviation",
    "сравн.": "comparative",
    "страд.": "passive",
    "счётн.": "numeral",
}
def extract_gloss(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Extract the senses found under ``level_node`` into ``word_entry``.

    Every list item found recursively below the node is handed to
    ``process_gloss_nodes`` with a 1-based sense index. When no list item is
    found at all, the node's children that are not level nodes are processed
    as a single gloss with index 1.
    """
    item_count = 0
    gloss_items = level_node.find_child_recursively(NodeKind.LIST_ITEM)
    for item_count, gloss_item in enumerate(gloss_items, start=1):
        process_gloss_nodes(wxr, word_entry, gloss_item.children, item_count)

    if item_count > 0:
        return

    # no list or empty list
    fallback_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    process_gloss_nodes(wxr, word_entry, fallback_nodes, 1)
def process_gloss_nodes(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    gloss_nodes: WikiNodeChildrenList,
    sense_index: int,
) -> None:
    """Build one sense from ``gloss_nodes`` and append it to ``word_entry``.

    Template nodes are dispatched on their name:

    * names whose lower-cased form is in ``IGNORED_TEMPLATES`` are dropped;
    * names in ``EXAMPLE_TEMPLATES`` go to ``process_example_template``;
    * "семантика" goes to ``process_semantics_template``;
    * names in ``TAG_GLOSS_TEMPLATES`` and/or ``GLOSS_TEMPLATES`` stay in the
      gloss text, and a mapped tag (if any) is added to ``sense.tags``;
    * names ending with "." and the name "помета" are treated as tag
      templates: kept in the raw gloss only, and expanded into raw tags;
    * "значение" goes to ``process_meaning_template``;
    * any other template, and every non-template node, is gloss text.

    Args:
        wxr: Extraction context passed through to the helpers.
        word_entry: Entry that receives the finished sense.
        gloss_nodes: Nodes making up the gloss.
        sense_index: Index of the sense, forwarded to
            ``process_semantics_template``.

    The sense is appended only if it differs from an empty ``Sense()``.
    """
    sense = Sense()

    raw_gloss_children: WikiNodeChildrenList = []
    clean_gloss_children: WikiNodeChildrenList = []
    tag_templates: list[WikiNode] = []

    for child in gloss_nodes:
        if not isinstance(child, TemplateNode):
            clean_gloss_children.append(child)
            raw_gloss_children.append(child)
            continue

        name = child.template_name
        if name.lower() in IGNORED_TEMPLATES:
            continue
        elif name in EXAMPLE_TEMPLATES:
            process_example_template(wxr, sense, child)
        elif name == "семантика":
            process_semantics_template(wxr, word_entry, child, sense_index)
        elif name in TAG_GLOSS_TEMPLATES or name in GLOSS_TEMPLATES:
            # Several names are listed in both tables. Handling them in one
            # branch ensures the tag is recorded even when the name is also
            # a plain gloss template.
            if name in TAG_GLOSS_TEMPLATES:
                sense.tags.append(TAG_GLOSS_TEMPLATES[name])
            clean_gloss_children.append(child)
            raw_gloss_children.append(child)
        elif name.endswith(".") or name == "помета":
            # Assume node is tag template
            tag_templates.append(child)
            raw_gloss_children.append(child)
        elif name == "значение":
            process_meaning_template(wxr, sense, word_entry, child)
        else:
            clean_gloss_children.append(child)
            raw_gloss_children.append(child)

    remove_obsolete_leading_nodes(raw_gloss_children)
    remove_obsolete_leading_nodes(clean_gloss_children)

    gloss = clean_node(wxr, None, clean_gloss_children)
    if len(gloss) > 0:
        sense.glosses.append(gloss)
    raw_gloss = clean_node(wxr, None, raw_gloss_children)
    # The raw gloss is only worth keeping when it adds something to the
    # clean one.
    if len(raw_gloss) > 0 and raw_gloss != gloss:
        sense.raw_glosses.append(raw_gloss)

    for tag_template in tag_templates:
        raw_tag = clean_node(wxr, None, tag_template)
        if raw_tag != "":
            sense.raw_tags.append(raw_tag)

    if sense != Sense():
        # NOTE(review): process_meaning_template may already have called
        # translate_raw_tags on this sense; confirm the helper is safe to run
        # twice on the same object.
        translate_raw_tags(sense)
        word_entry.senses.append(sense)
def remove_obsolete_leading_nodes(nodes: WikiNodeChildrenList):
    """Strip leading filler strings from ``nodes`` in place.

    A leading element is removed while it is a string whose stripped form is
    empty, a bare conjunction ("и", "или") or a lone punctuation mark. The
    first element that is not such a string stops the removal; later elements
    are never touched.
    """
    filler = ("", "и", "или", ",", ".", ";", ":", "\n")
    prefix_length = 0
    for node in nodes:
        if not isinstance(node, str) or node.strip() not in filler:
            break
        prefix_length += 1
    del nodes[:prefix_length]
def process_meaning_template(
    wxr: WiktextractContext,
    sense: Optional[Sense],
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> Sense:
    """Fill a sense from the parameters of a "значение" template.

    Handled parameters:

    * "определение": cleaned text is added to ``sense.glosses``;
    * "пометы": cleaned text is added to ``sense.raw_tags``;
    * "примеры": each template node is passed to
      ``process_example_template``;
    * any name in ``LINKAGE_TITLES``: each word becomes a ``Linkage`` on the
      ``word_entry`` attribute named by the table, with the definition as
      its ``sense``.

    Args:
        wxr: Extraction context passed through to the helpers.
        sense: Sense to fill; a new ``Sense`` is created when ``None``.
        word_entry: Entry that receives the linkages.
        template_node: The "значение" template node.

    Returns:
        The sense that was filled (the one passed in, or the new one).
    """
    # https://ru.wiktionary.org/wiki/Шаблон:значение
    if sense is None:
        sense = Sense()

    parameters = template_node.template_parameters

    # Resolve the definition before anything else: linkages record it as
    # their sense, and that must not depend on the order in which the
    # parameters were written in the template.
    gloss = ""
    if "определение" in parameters:
        gloss = clean_node(wxr, None, parameters["определение"])
        if len(gloss) > 0:
            sense.glosses.append(gloss)

    for param_name, param_value in parameters.items():
        if param_name == "определение":
            continue  # handled above

        # NOTE(review): a parameter holding exactly one node appears to be
        # delivered as the bare node rather than a one-element list - confirm
        # against wikitextprocessor. Wrapping it keeps a lone example
        # template or a lone link from being silently skipped.
        value_nodes = (
            [param_value] if isinstance(param_value, WikiNode) else param_value
        )

        if param_name == "пометы":
            raw_tag = clean_node(wxr, None, param_value)
            if len(raw_tag) > 0:
                sense.raw_tags.append(raw_tag)
        elif param_name == "примеры" and isinstance(value_nodes, list):
            for t_node in value_nodes:
                if isinstance(t_node, TemplateNode):
                    process_example_template(wxr, sense, t_node)
        elif param_name in LINKAGE_TITLES:
            linkage_type = LINKAGE_TITLES[param_name]
            if isinstance(value_nodes, str):
                # Plain text: words are separated by commas or semicolons;
                # a lone "-" is skipped.
                for linkage_word in re.split(r",|;", value_nodes):
                    linkage_word = linkage_word.strip()
                    if len(linkage_word) > 0 and linkage_word != "-":
                        linkage_list = getattr(word_entry, linkage_type)
                        linkage_list.append(
                            Linkage(word=linkage_word, sense=gloss)
                        )
            elif isinstance(value_nodes, list):
                # Parsed nodes: only link nodes contribute words.
                for param_node in value_nodes:
                    if (
                        isinstance(param_node, WikiNode)
                        and param_node.kind == NodeKind.LINK
                    ):
                        linkage_word = clean_node(wxr, None, param_node)
                        if len(linkage_word) > 0:
                            linkage_list = getattr(word_entry, linkage_type)
                            linkage_list.append(
                                Linkage(word=linkage_word, sense=gloss)
                            )

    if len(sense.glosses) > 0:
        translate_raw_tags(sense)

    # The whole template is cleaned once more with the sense as target.
    # NOTE(review): presumably this collects categories emitted by the
    # template into the sense - confirm against clean_node.
    clean_node(wxr, sense, template_node)
    return sense