Coverage for src/wiktextract/extractor/ru/gloss.py: 93%

113 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8 WikiNodeChildrenList, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .example import EXAMPLE_TEMPLATES, process_example_template 

14from .linkage import process_semantics_template 

15from .models import Linkage, Sense, WordEntry 

16from .section_titles import LINKAGE_TITLES 

17from .tags import translate_raw_tags 

18 

# Template names that are skipped when collecting gloss nodes.  The lookup in
# process_gloss_list_item() lower-cases the template name first, so the
# capitalised variant never matches there; it is kept so that the set's
# contents stay backward-compatible for any other user of this constant.
IGNORED_TEMPLATES = {"нужен перевод", "??", "?", "Нужен перевод"}

# Gloss templates whose name maps directly to an English tag.  The template
# node itself is still kept as part of the gloss text by the caller.
TAG_GLOSS_TEMPLATES = {
    "многокр.": "iterative",
    "нареч.": "adverb",
    "наречие": "adverb",  # redirect to "нареч."
    "однокр.": "semelfactive",  # was misspelled "semelefactive"
    "превосх.": "superlative",
    "прич.": "participle",
    "сокр.": "abbreviation",
    "сравн.": "comparative",
    "страд.": "passive",
    "счётн.": "numeral",
}

33 

34 

def extract_gloss(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Extract senses from a gloss section into ``word_entry``.

    Every list item found in the section's direct child lists is handed to
    ``process_gloss_list_item`` together with its 1-based position inside
    its list.  When the section has no list item at all, the children
    returned by ``invert_find_child(LEVEL_KIND_FLAGS)`` are converted back to
    wikitext, re-parsed, and processed as a single sense with index 1.
    """
    section_title = clean_node(wxr, None, level_node.largs)

    found_list_item = False
    for list_node in level_node.find_child(NodeKind.LIST):
        list_items = list_node.find_child(NodeKind.LIST_ITEM)
        for sense_index, list_item in enumerate(list_items, 1):
            process_gloss_list_item(
                wxr,
                word_entry,
                list_item,
                sense_index,
                section_title=section_title,
            )
            found_list_item = True

    if found_list_item:
        return

    # No gloss list: treat the section content itself as one gloss.
    # NOTE(review): presumably invert_find_child() drops nested sub-sections
    # here - confirm against wikitextprocessor.
    remaining_children = list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    wikitext = wxr.wtp.node_to_wikitext(remaining_children)
    root_node = wxr.wtp.parse(wikitext)
    process_gloss_list_item(
        wxr, word_entry, root_node, 1, section_title=section_title
    )

61 

62 

def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense_index: int,
    parent_sense: Sense | None = None,
    section_title: str = "",
) -> None:
    """Build a sense from one gloss list item, then recurse into sub-lists.

    The new sense starts as a deep copy of ``parent_sense`` when one is
    given, otherwise as an empty ``Sense``.  A section title other than
    "Значение"/"Значения" (or the empty string) is recorded as a raw tag.

    Template children are dispatched by name:

    * example templates and "семантика" are passed to their own processors;
    * names in ``TAG_GLOSS_TEMPLATES`` add a tag and stay in the gloss;
    * names ending in "." plus "причастие"/"умласк" go through
      ``extract_dot_template``;
    * "помета" becomes a raw tag unless it has a "nocolor" parameter, in
      which case it stays in the gloss;
    * "значение" is handled by ``process_meaning_template``;
    * anything else is kept as gloss text unless its lower-cased name is in
      ``IGNORED_TEMPLATES``.

    Non-template children are kept as gloss text, except nested lists, which
    are processed recursively after the current sense.  The sense is appended
    to ``word_entry.senses`` only if it ends up with at least one gloss.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)
    if section_title not in ("", "Значение", "Значения"):
        sense.raw_tags.append(section_title)

    gloss_nodes = []
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            # Nested lists are handled by the recursion at the end.
            is_nested_list = (
                isinstance(child, WikiNode) and child.kind == NodeKind.LIST
            )
            if not is_nested_list:
                gloss_nodes.append(child)
            continue

        name = child.template_name
        if name in EXAMPLE_TEMPLATES:
            process_example_template(wxr, sense, child)
        elif name == "семантика":
            process_semantics_template(wxr, word_entry, child, sense_index)
        elif name in TAG_GLOSS_TEMPLATES:
            sense.tags.append(TAG_GLOSS_TEMPLATES[name])
            gloss_nodes.append(child)
        elif name.endswith(".") or name in ("причастие", "умласк"):
            extract_dot_template(wxr, sense, child, gloss_nodes)
        elif name == "помета":
            if "nocolor" in child.template_parameters:
                gloss_nodes.append(child)
                continue
            raw_tag = clean_node(wxr, sense, child)
            if raw_tag not in ("", "?"):
                sense.raw_tags.append(raw_tag)
        elif name == "значение":
            process_meaning_template(wxr, sense, word_entry, child)
        elif name.lower() not in IGNORED_TEMPLATES:
            gloss_nodes.append(child)

    remove_obsolete_leading_nodes(gloss_nodes)
    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if gloss_text:
        sense.glosses.append(gloss_text)
    if sense.glosses:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    # Sub-list items inherit the current sense (glosses, tags, ...) and the
    # same sense index as their parent item.
    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(
                wxr, word_entry, nested_item, sense_index, sense
            )

118 

119 

def remove_obsolete_leading_nodes(nodes: WikiNodeChildrenList):
    """Drop leading filler strings from ``nodes`` in place.

    A leading element is removed when it is a string that, once stripped of
    surrounding whitespace, is empty, one of the conjunctions "и"/"или", or
    one of the punctuation marks ``, . ; :``.  Removal stops at the first
    element that is not such a string; later elements are left untouched.
    """
    filler = ("", "и", "или", ",", ".", ";", ":")
    prefix_len = 0
    for item in nodes:
        if not isinstance(item, str) or item.strip() not in filler:
            break
        prefix_len += 1
    del nodes[:prefix_len]

127 

128 

def process_meaning_template(
    wxr: WiktextractContext,
    sense: Sense | None,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> Sense:
    """Fill a sense and linkage lists from a "значение" template.

    Handled parameters:

    * "определение": cleaned text is appended to ``sense.glosses``;
    * "пометы": cleaned text is appended to ``sense.raw_tags``;
    * "примеры": every template node in the value is passed to
      ``process_example_template``;
    * any name in ``LINKAGE_TITLES``: the words found in the value are
      appended as ``Linkage`` objects to the matching ``word_entry`` list,
      with the definition text as their ``sense``.

    The definition is resolved before the other parameters are walked, so
    linkage entries receive the gloss text regardless of the order in which
    the parameters were written in the wikitext.

    Args:
        sense: Sense to extend; a new ``Sense`` is created when ``None``.
        word_entry: Entry whose linkage lists receive the linked words.
        template_node: The "значение" template node.

    Returns:
        The sense that was extended (or created).
    """
    # https://ru.wiktionary.org/wiki/Шаблон:значение
    if sense is None:
        sense = Sense()

    params = template_node.template_parameters

    gloss = ""
    if "определение" in params:
        gloss = clean_node(wxr, None, params["определение"])
        if len(gloss) > 0:
            sense.glosses.append(gloss)

    for param_name, param_value in params.items():
        if param_name == "определение":
            # Already handled above.
            continue
        if param_name == "пометы":
            raw_tag = clean_node(wxr, None, param_value)
            if len(raw_tag) > 0:
                sense.raw_tags.append(raw_tag)
        elif param_name == "примеры" and isinstance(param_value, list):
            for t_node in param_value:
                if isinstance(t_node, TemplateNode):
                    process_example_template(wxr, sense, t_node)
        elif param_name in LINKAGE_TITLES:
            linkage_type = LINKAGE_TITLES[param_name]
            if isinstance(param_value, str) and len(param_value.strip()) > 0:
                # Plain text value: words separated by commas or semicolons;
                # a lone "-" is a placeholder, not a word.
                for linkage_word in re.split(r",|;", param_value):
                    linkage_word = linkage_word.strip()
                    if len(linkage_word) > 0 and linkage_word != "-":
                        linkage_list = getattr(word_entry, linkage_type)
                        linkage_list.append(
                            Linkage(word=linkage_word, sense=gloss)
                        )
            elif isinstance(param_value, list):
                # Parsed value: only link nodes are taken as linkage words.
                for param_node in param_value:
                    if (
                        isinstance(param_node, WikiNode)
                        and param_node.kind == NodeKind.LINK
                    ):
                        linkage_word = clean_node(wxr, None, param_node)
                        if len(linkage_word) > 0:
                            linkage_list = getattr(word_entry, linkage_type)
                            linkage_list.append(
                                Linkage(word=linkage_word, sense=gloss)
                            )

    if len(sense.glosses) > 0:
        translate_raw_tags(sense)

    # NOTE(review): the result is discarded; presumably this call is made so
    # clean_node() can record data (e.g. categories) on the sense - confirm.
    clean_node(wxr, sense, template_node)
    return sense

181 

182 

def extract_dot_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
    gloss_nodes: list[WikiNode | str],
) -> None:
    """Expand a template and sort its output into raw tags and gloss nodes.

    The template is converted back to wikitext and parsed with full
    expansion.  A link node in the result that contains a ``span`` whose
    ``style`` includes ``background-color:#CCFFFF`` is treated as a tag: its
    cleaned text, if non-empty, is appended to ``sense.raw_tags`` and the
    node is left out of the gloss.  Every other node is appended to
    ``gloss_nodes``.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    for node in expanded_root.children:
        if not (isinstance(node, WikiNode) and node.kind == NodeKind.LINK):
            gloss_nodes.append(node)
            continue

        has_tag_span = any(
            "background-color:#CCFFFF" in span_tag.attrs.get("style", "")
            for span_tag in node.find_html_recursively("span")
        )
        if not has_tag_span:
            gloss_nodes.append(node)
            continue

        raw_tag = clean_node(wxr, None, node)
        if raw_tag != "":
            sense.raw_tags.append(raw_tag)