Coverage for src/wiktextract/extractor/ru/gloss.py: 88%

108 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from typing import Optional 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9 WikiNodeChildrenList, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .example import EXAMPLE_TEMPLATES, process_example_template 

15from .linkage import process_semantics_template 

16from .models import Linkage, Sense, WordEntry 

17from .section_titles import LINKAGE_TITLES 

18from .tags import translate_raw_tags 

19 

# Templates that are part of the clean gloss when expanded.
# process_gloss_nodes() looks names up here exactly as written in the
# wikitext, so membership is case-sensitive.
GLOSS_TEMPLATES = {
    "-",
    "=",
    "===",
    "lang",
    "аббр.",
    "выдел",
    "гипокор.",
    "дееприч.",
    "действие",
    "женск.",
    "ласк.",
    "мн",
    "морфема",
    "нареч.",
    "наречие",
    "однокр.",
    "отн.",
    "по.",
    "по",
    "превосх.",
    "прич.",
    "свойство",
    "совершить",
    "сокр.",
    "сокращ",
    "соотн.",
    "сравн.",
    "страд.",
    "то же",
    "увелич.",
    "уменьш.",
    "умласк",
    "умласк.",
    "унич.",
    "уничиж.",
    "хим-элем",
    "элемент",
}

# Templates that are dropped from the gloss altogether.
# NOTE(review): process_gloss_nodes() lowercases the template name before
# testing membership, so the capitalised "Нужен перевод" entry cannot match
# there. It is kept because this set may be imported elsewhere - confirm
# before removing it.
IGNORED_TEMPLATES = {"нужен перевод", "??", "?", "Нужен перевод"}

# Templates that stay in the gloss text and additionally contribute a tag to
# the sense. Maps template name -> tag.
TAG_GLOSS_TEMPLATES = {
    "многокр.": "iterative",
    "нареч.": "adverb",
    "наречие": "adverb",  # redirect to "нареч."
    # Was misspelled "semelefactive"; the aspect is called "semelfactive".
    "однокр.": "semelfactive",
    "превосх.": "superlative",
    "прич.": "participle",
    "сокр.": "abbreviation",
    "сравн.": "comparative",
    "страд.": "passive",
    "счётн.": "numeral",
}

75 

76 

def extract_gloss(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Extract the senses found under ``level_node`` into ``word_entry``.

    Every ``LIST_ITEM`` node found recursively below ``level_node`` is
    handed to ``process_gloss_nodes`` with a 1-based sense index. When no
    list item exists at all, the nodes returned by
    ``level_node.invert_find_child(LEVEL_KIND_FLAGS)`` are processed
    together as sense number 1 instead.
    """
    list_items = level_node.find_child_recursively(NodeKind.LIST_ITEM)

    # Stays 0 only when the iterator yields nothing.
    sense_count = 0
    for sense_count, list_item in enumerate(list_items, 1):
        process_gloss_nodes(wxr, word_entry, list_item.children, sense_count)

    if sense_count > 0:
        return

    # no list or empty list
    fallback_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    process_gloss_nodes(wxr, word_entry, fallback_nodes, 1)

94 

95 

def process_gloss_nodes(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    gloss_nodes: WikiNodeChildrenList,
    sense_index: int,
) -> None:
    """Build one ``Sense`` from ``gloss_nodes`` and add it to ``word_entry``.

    Each node is routed by kind and template name:

    * ignored templates are skipped;
    * example templates and the "значение" template fill ``sense`` through
      their dedicated processors;
    * "семантика" is passed to ``process_semantics_template`` together with
      ``sense_index``;
    * gloss templates and tag-gloss templates stay in both the clean and
      the raw gloss, and tag-gloss templates also add their tag;
    * any other template whose name ends with "." and the "помета"
      template are treated as tag templates: kept in the raw gloss only
      and expanded into ``sense.raw_tags``;
    * everything else (other templates, text, other nodes) is gloss text.

    The sense is appended to ``word_entry.senses`` only if something was
    extracted, i.e. it differs from a freshly constructed ``Sense``.
    """
    sense = Sense()

    raw_gloss_children: WikiNodeChildrenList = []
    clean_gloss_children: WikiNodeChildrenList = []
    tag_templates: list[WikiNode] = []

    for child in gloss_nodes:
        if not isinstance(child, TemplateNode):
            clean_gloss_children.append(child)
            raw_gloss_children.append(child)
            continue

        name = child.template_name
        if name.lower() in IGNORED_TEMPLATES:
            continue
        if name in EXAMPLE_TEMPLATES:
            process_example_template(wxr, sense, child)
        elif name == "семантика":
            process_semantics_template(wxr, word_entry, child, sense_index)
        elif name in GLOSS_TEMPLATES or name in TAG_GLOSS_TEMPLATES:
            # Most tag-gloss templates are also listed in GLOSS_TEMPLATES.
            # Checking GLOSS_TEMPLATES first in a separate branch used to
            # shadow them, so their tag was never recorded.
            if name in TAG_GLOSS_TEMPLATES:
                sense.tags.append(TAG_GLOSS_TEMPLATES[name])
            clean_gloss_children.append(child)
            raw_gloss_children.append(child)
        elif name.endswith(".") or name == "помета":
            # Assume node is tag template
            tag_templates.append(child)
            raw_gloss_children.append(child)
        elif name == "значение":
            process_meaning_template(wxr, sense, word_entry, child)
        else:
            clean_gloss_children.append(child)
            raw_gloss_children.append(child)

    remove_obsolete_leading_nodes(raw_gloss_children)
    remove_obsolete_leading_nodes(clean_gloss_children)

    gloss = clean_node(wxr, None, clean_gloss_children)
    if gloss:
        sense.glosses.append(gloss)
    raw_gloss = clean_node(wxr, None, raw_gloss_children)
    # The raw gloss is only worth storing when it adds something.
    if raw_gloss and raw_gloss != gloss:
        sense.raw_glosses.append(raw_gloss)

    for tag_template in tag_templates:
        raw_tag = clean_node(wxr, None, tag_template)
        if raw_tag:
            sense.raw_tags.append(raw_tag)

    # Skip senses for which nothing at all was extracted.
    if sense != Sense():
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

163 

164 

def remove_obsolete_leading_nodes(nodes: WikiNodeChildrenList):
    """Drop leading filler strings from ``nodes`` in place.

    A leading node is filler when it is a ``str`` whose stripped text is
    empty, one of the conjunctions "и" / "или", or a single punctuation
    mark from the list below. Removal stops at the first node that is not
    filler, including any non-string node.
    """
    fillers = ["", "и", "или", ",", ".", ";", ":", "\n"]

    # Count the filler prefix first, then cut it off in one operation.
    prefix_length = 0
    for node in nodes:
        if not isinstance(node, str) or node.strip() not in fillers:
            break
        prefix_length += 1

    del nodes[:prefix_length]

172 

173 

def process_meaning_template(
    wxr: WiktextractContext,
    sense: Optional[Sense],
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> Sense:
    """Fill a sense from the named parameters of a "значение" template.

    Handled parameters:

    * "определение" - cleaned and appended to ``sense.glosses``;
    * "пометы" - cleaned and appended to ``sense.raw_tags``;
    * "примеры" - each template node in the value is passed to
      ``process_example_template``;
    * any key of ``LINKAGE_TITLES`` - the listed words are appended as
      ``Linkage`` objects to the ``word_entry`` attribute named by the
      mapping, each carrying the definition text as its ``sense``.

    The definition is resolved before any other parameter so that linkage
    entries get the gloss text regardless of where "определение" appears
    in the template call.

    ``sense`` may be ``None``, in which case a new ``Sense`` is created.
    The filled sense is returned.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:значение
    if sense is None:
        sense = Sense()

    params = template_node.template_parameters

    gloss = ""
    if "определение" in params:
        gloss = clean_node(wxr, None, params["определение"])
        if gloss:
            sense.glosses.append(gloss)

    for param_name, param_value in params.items():
        if param_name == "определение":
            continue  # already handled above
        if param_name == "пометы":
            raw_tag = clean_node(wxr, None, param_value)
            if raw_tag:
                sense.raw_tags.append(raw_tag)
        elif param_name == "примеры" and isinstance(param_value, list):
            for t_node in param_value:
                if isinstance(t_node, TemplateNode):
                    process_example_template(wxr, sense, t_node)
        elif param_name in LINKAGE_TITLES:
            if isinstance(param_value, str):
                # Plain text: words separated by commas or semicolons,
                # with "-" used as an explicit "none" placeholder.
                candidates = (
                    word.strip() for word in re.split(r",|;", param_value)
                )
                linkage_words = [
                    word for word in candidates if word and word != "-"
                ]
            elif isinstance(param_value, list):
                # Node list: only link nodes are taken as linkage words.
                cleaned = (
                    clean_node(wxr, None, param_node)
                    for param_node in param_value
                    if isinstance(param_node, WikiNode)
                    and param_node.kind == NodeKind.LINK
                )
                linkage_words = [word for word in cleaned if word]
            else:
                linkage_words = []

            if linkage_words:
                linkage_list = getattr(word_entry, LINKAGE_TITLES[param_name])
                linkage_list.extend(
                    Linkage(word=word, sense=gloss) for word in linkage_words
                )

    if sense.glosses:
        translate_raw_tags(sense)

    # Return value is discarded on purpose; presumably this call lets
    # clean_node record data from the template on ``sense`` - TODO confirm.
    clean_node(wxr, sense, template_node)
    return sense