Coverage for src/wiktextract/extractor/ru/gloss.py: 93%

115 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8 WikiNodeChildrenList, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .example import EXAMPLE_TEMPLATES, process_example_template 

14from .linkage import process_semantics_template 

15from .models import Linkage, Sense, WordEntry 

16from .section_titles import LINKAGE_TITLES 

17from .tags import translate_raw_tags 

18 

# Names of templates that are dropped instead of being added to the gloss.
# process_gloss_list_item() looks names up here after lower-casing them.
IGNORED_TEMPLATES = {
    "нужен перевод",
    "Нужен перевод",
    "?",
    "??",
}

20 

# Maps a template name to the tag appended to the sense when that template
# appears in a gloss list item. The template itself is still kept as part of
# the gloss text (see process_gloss_list_item()).
TAG_GLOSS_TEMPLATES = {
    "многокр.": "iterative",
    "нареч.": "adverb",
    "наречие": "adverb",  # redirect to "нареч."
    # Spelling fixed: the linguistic term is "semelfactive".
    "однокр.": "semelfactive",
    "превосх.": "superlative",
    "прич.": "participle",
    "сокр.": "abbreviation",
    "сравн.": "comparative",
    "страд.": "passive",
    "счётн.": "numeral",
}

33 

34 

def extract_gloss(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Extract the senses found in a gloss section into ``word_entry``.

    Every list item of every list directly under ``level_node`` is handed to
    ``process_gloss_list_item``; the sense index restarts at 1 for each list.
    When the section has no list item at all, the section's non-heading
    children are re-parsed as a single node and processed as sense 1.
    """
    section_title = clean_node(wxr, None, level_node.largs)

    found_list_item = False
    for gloss_list in level_node.find_child(NodeKind.LIST):
        list_items = gloss_list.find_child(NodeKind.LIST_ITEM)
        for index, item in enumerate(list_items, start=1):
            process_gloss_list_item(
                wxr, word_entry, item, index, section_title=section_title
            )
            found_list_item = True

    if found_list_item:
        return

    # No list item: treat everything except nested level (heading) nodes as
    # one pseudo list item.
    body_nodes = list(
        level_node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
    )
    body_wikitext = wxr.wtp.node_to_wikitext(body_nodes)
    body_root = wxr.wtp.parse(body_wikitext)
    process_gloss_list_item(
        wxr, word_entry, body_root, 1, section_title=section_title
    )

65 

66 

def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense_index: int,
    parent_sense: Sense | None = None,
    section_title: str = "",
) -> None:
    """Turn one gloss list item into a ``Sense`` and recurse into sub-lists.

    The sense starts empty, or as a deep copy of ``parent_sense`` when one is
    given. Templates among the children are dispatched by name (examples,
    semantics, tags, labels, the "значение" template); the remaining nodes
    form the gloss text. The sense is appended to ``word_entry.senses`` only
    if it ends up with at least one gloss. Nested lists are processed with
    this sense as their parent and the same ``sense_index``.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    # A section title other than the plain "meaning(s)" headings is recorded
    # as a raw tag.
    if section_title not in ("", "Значение", "Значения"):
        sense.raw_tags.append(section_title)

    gloss_nodes = []
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            # Nested lists are handled by the recursion below, everything
            # else belongs to the gloss text.
            if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
                gloss_nodes.append(child)
            continue

        name = child.template_name
        if name in EXAMPLE_TEMPLATES:
            process_example_template(wxr, sense, child)
        elif name == "семантика":
            process_semantics_template(wxr, word_entry, child, sense_index)
        elif name in TAG_GLOSS_TEMPLATES:
            # The template yields a tag and also stays in the gloss text.
            sense.tags.append(TAG_GLOSS_TEMPLATES[name])
            gloss_nodes.append(child)
        elif name.endswith(".") or name in ("причастие", "умласк"):
            extract_dot_template(wxr, sense, child, gloss_nodes)
        elif name == "помета":
            if "nocolor" in child.template_parameters:
                gloss_nodes.append(child)
            else:
                label = clean_node(wxr, sense, child)
                if label not in ("", "?"):
                    sense.raw_tags.append(label)
        elif name == "значение":
            process_meaning_template(wxr, sense, word_entry, child)
        elif name.lower() not in IGNORED_TEMPLATES:
            gloss_nodes.append(child)

    remove_obsolete_leading_nodes(gloss_nodes)
    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if gloss_text:
        sense.glosses.append(gloss_text)
    if sense.glosses:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    # Sub-items start from a deep copy of this sense (see the top of this
    # function); section_title is left at its default for them.
    for sub_list in list_item.find_child(NodeKind.LIST):
        for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(
                wxr, word_entry, sub_item, sense_index, sense
            )

122 

123 

def remove_obsolete_leading_nodes(nodes: WikiNodeChildrenList):
    """Strip leading filler strings from ``nodes`` in place.

    Leading items are removed while they are strings that, once stripped of
    surrounding whitespace, are empty, a conjunction ("и", "или") or a single
    punctuation mark (",", ".", ";", ":"). The first item that is not such a
    string stops the removal; later items are never touched.
    """
    filler = ("", "и", "или", ",", ".", ";", ":")
    leading = 0
    for item in nodes:
        if not isinstance(item, str) or item.strip() not in filler:
            break
        leading += 1
    # One slice deletion removes the whole leading run.
    del nodes[:leading]

131 

132 

def _iter_meaning_linkage_words(wxr: WiktextractContext, param_value):
    """Yield the linkage words held in one "значение" template parameter."""
    if isinstance(param_value, str):
        if param_value.strip():
            # Plain text: words are separated by commas or semicolons and a
            # lone "-" is skipped.
            for piece in re.split(r",|;", param_value):
                word = piece.strip()
                if word and word != "-":
                    yield word
    elif isinstance(param_value, list):
        # Node list: only link nodes contribute words.
        for node in param_value:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word:
                    yield word


def process_meaning_template(
    wxr: WiktextractContext,
    sense: Sense | None,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> Sense:
    """Fill a sense from the parameters of a "значение" template.

    Parameters are visited in the order stored in
    ``template_node.template_parameters``: "определение" supplies a gloss,
    "пометы" a raw tag, "примеры" example templates, and any parameter named
    in ``LINKAGE_TITLES`` adds ``Linkage`` entries to ``word_entry``, each
    carrying the most recently read gloss as its ``sense``.

    A new ``Sense`` is created when ``sense`` is None. The sense is returned;
    it is not appended to ``word_entry`` here.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:значение
    if sense is None:
        sense = Sense()

    gloss = ""
    for name, value in template_node.template_parameters.items():
        if name == "определение":
            gloss = clean_node(wxr, None, value)
            if gloss:
                sense.glosses.append(gloss)
        elif name == "пометы":
            label = clean_node(wxr, None, value)
            if label:
                sense.raw_tags.append(label)
        elif name == "примеры" and isinstance(value, list):
            for example_node in value:
                if isinstance(example_node, TemplateNode):
                    process_example_template(wxr, sense, example_node)
        elif name in LINKAGE_TITLES:
            linkage_type = LINKAGE_TITLES[name]
            for word in _iter_meaning_linkage_words(wxr, value):
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, sense=gloss)
                )

    if sense.glosses:
        translate_raw_tags(sense)

    # Run clean_node over the whole template with the sense attached; the
    # returned text is discarded.
    clean_node(wxr, sense, template_node)
    return sense

185 

186 

def extract_dot_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
    gloss_nodes: list[WikiNode | str],
) -> None:
    """Expand a label-style template and sort its output into tags and gloss.

    The template is converted back to wikitext and re-parsed with full
    expansion. A link node containing a ``<span>`` whose style includes
    "background-color:#CCFFFF" is treated as a raw tag of ``sense``; any
    other link node contributes its cleaned text to ``gloss_nodes``; every
    non-link node is appended to ``gloss_nodes`` unchanged.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)

    for node in expanded_root.children:
        if not (isinstance(node, WikiNode) and node.kind == NodeKind.LINK):
            gloss_nodes.append(node)
            continue

        # any() stops at the first matching span, like a loop with break.
        is_tag = any(
            "background-color:#CCFFFF" in span.attrs.get("style", "")
            for span in node.find_html_recursively("span")
        )
        if is_tag:
            raw_tag = clean_node(wxr, None, node)
            if raw_tag != "":
                sense.raw_tags.append(raw_tag)
        else:
            link_text = clean_node(wxr, sense, node)
            if link_text != "":
                gloss_nodes.append(link_text)