Coverage for src/wiktextract/extractor/ru/linkage.py: 83%

130 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from wikitextprocessor import ( 

2 HTMLNode, 

3 LevelNode, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .models import Form, Linkage, WordEntry 

12from .section_titles import LINKAGE_TITLES 

13from .tags import translate_raw_tags 

14 

15 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    level_node: LevelNode,
) -> None:
    """Collect linkage words from the list items of a linkage section.

    Every list item found recursively under ``level_node`` is scanned in
    order: a link node sets the current word, a template node contributes
    raw tags through ``find_linkage_tag``, and a string node that is exactly
    ``;`` or ``,`` (after stripping) closes the current word.  Each finished
    word is appended to the ``linkage_type`` list attribute of
    ``word_entry``.  Afterwards, direct child templates named
    ``родств-блок`` are passed to ``process_related_block_template``.

    Args:
        wxr: Context forwarded to ``clean_node`` and the template helpers.
        word_entry: Entry whose ``linkage_type`` list receives the results.
        linkage_type: Name of the list attribute on ``word_entry``.
        level_node: Section node to scan.
    """
    sense_index = 0
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Only items whose list marker is exactly "#" advance the index.
        if list_item.sarg == "#":
            sense_index += 1
        linkage = Linkage(sense_index=sense_index)
        for node in list_item.children:
            if isinstance(node, WikiNode):
                if node.kind == NodeKind.LINK:
                    linkage.word = clean_node(wxr, None, node)
                elif isinstance(node, TemplateNode):
                    find_linkage_tag(wxr, linkage, node)
            elif isinstance(node, str) and node.strip() in (";", ","):
                if len(linkage.word) > 0:
                    translate_raw_tags(linkage)
                    getattr(word_entry, linkage_type).append(linkage)
                    # Raw tags that are still on the entry after translation.
                    tags = linkage.raw_tags
                    linkage = Linkage(sense_index=sense_index)
                    if node.strip() == ",":
                        # A comma carries the tags over to the next word.
                        # Copy the list so that tags added to the next word
                        # cannot show up on the entry that was just stored.
                        linkage.raw_tags = list(tags)

        # Flush the last word of the list item (no trailing separator).
        if len(linkage.word) > 0:
            translate_raw_tags(linkage)
            getattr(word_entry, linkage_type).append(linkage)

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "родств-блок":
            process_related_block_template(
                wxr, word_entry, t_node, linkage_type
            )

52 

53 

def find_linkage_tag(
    wxr: WiktextractContext,
    linkage: Linkage,
    template_node: TemplateNode,
) -> None:
    """Add the text of each ``<span>`` in the expanded template as a raw tag.

    The template is converted back to wikitext, re-parsed with full
    expansion, and every ``span`` element found recursively in the result is
    cleaned with ``clean_node``.  Non-empty texts are appended to
    ``linkage.raw_tags``.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for span in expanded_root.find_html_recursively("span"):
        tag_text = clean_node(wxr, None, span)
        if not tag_text:
            continue
        linkage.raw_tags.append(tag_text)

66 

67 

def process_related_block_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    l_type: str,
) -> None:
    """Extract linkages from the table produced by the related-words template.

    Used for the "Родственные слова" section (Шаблон:родств-блок).  The
    template is expanded and each table among the direct children of the
    result is walked row by row.  Header text and per-row span labels are
    remembered and attached as raw tags to every linked word, which is then
    appended to the ``l_type`` list attribute of ``word_entry``.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    item_child_kinds = NodeKind.HTML | NodeKind.LINK

    for table in expanded_root.find_child(NodeKind.TABLE):
        table_header = ""
        for table_row in table.find_child(NodeKind.TABLE_ROW):
            # The row label is reset for every row; the table header is not.
            row_header = ""
            for cell in table_row.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text = clean_node(wxr, None, cell)
                    if header_text.startswith("Список всех слов с корнем"):
                        table_header = header_text
                    continue
                if cell.kind != NodeKind.TABLE_CELL:
                    continue
                if "block-head" in cell.attrs.get("class", ""):
                    table_header = clean_node(wxr, None, cell)
                    continue

                for item in cell.find_child_recursively(NodeKind.LIST_ITEM):
                    for child in item.find_child(item_child_kinds):
                        if isinstance(child, HTMLNode) and child.tag == "span":
                            # A span labels the links that follow it.
                            span_text = clean_node(wxr, None, child)
                            row_header = span_text.removesuffix(":")
                            continue
                        if child.kind != NodeKind.LINK:
                            continue

                        entry = Linkage(word=clean_node(wxr, None, child))
                        for header in (table_header, row_header):
                            if header != "":
                                entry.raw_tags.append(header)
                        if entry.word != "":
                            translate_raw_tags(entry)
                            getattr(word_entry, l_type).append(entry)

120 

121 

def _append_phrase_linkage(
    word_entry: WordEntry, word: str, title_text: str
) -> None:
    """Store ``word`` on ``word_entry`` as a proverb or a derived term.

    Empty words are ignored.  Unless ``title_text`` is one of the two
    standard section titles, it is added as a raw tag and
    ``translate_raw_tags`` is run on the new linkage.  The linkage goes to
    ``word_entry.proverbs`` when the title is "пословицы и поговорки" and to
    ``word_entry.derived`` otherwise.
    """
    if len(word) == 0:
        return
    linkage = Linkage(word=word)
    if title_text not in (
        "фразеологизмы и устойчивые сочетания",
        "пословицы и поговорки",
    ):
        linkage.raw_tags.append(title_text)
        translate_raw_tags(linkage)
    if title_text == "пословицы и поговорки":
        word_entry.proverbs.append(linkage)
    else:
        word_entry.derived.append(linkage)


def extract_phrase_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
    title_text: str,
) -> None:
    """Extract phrases from a phrase section and store them as linkages.

    Handles the "Фразеологизмы и устойчивые сочетания" section.  Direct child
    templates named "в три колонки" or "фразеологизмы" (case-insensitive) are
    expanded, and this function is called again on each ``div`` with class
    ``col3`` in the expansion.  Then every list item below the direct child
    lists of ``level_node`` is split into phrases, which are stored through
    ``_append_phrase_linkage``.

    Args:
        wxr: Context used for template expansion and ``clean_node``.
        word_entry: Entry whose ``proverbs`` or ``derived`` list is extended.
        level_node: Section node, or an expanded ``div`` node on recursion.
        title_text: Section title; selects the target list and extra raw tag.
    """
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        # a template that adds links to words in list
        # https://ru.wiktionary.org/wiki/Шаблон:в_три_колонки
        if t_node.template_name.lower() in ["в три колонки", "фразеологизмы"]:
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(t_node), expand_all=True
            )
            for div_tag in expanded_node.find_html(
                "div", attr_name="class", attr_value="col3"
            ):
                extract_phrase_section(wxr, word_entry, div_tag, title_text)

    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child_recursively(NodeKind.LIST_ITEM):
            # Nodes seen before the first link; they are prepended to every
            # phrase of this list item and never cleared.
            prefix_nodes = []
            before_link = True
            # Nodes of the phrase currently being assembled.
            word_nodes = []
            inside_brackets = False
            for node in list_item.children:
                if isinstance(node, str) and len(node.strip()) > 0:
                    if before_link:
                        prefix_nodes.append(node)
                    elif node.strip().startswith("("):
                        inside_brackets = True
                        word_nodes.append(node)
                    elif node.strip().startswith(")"):
                        inside_brackets = False
                        word_nodes.append(node.strip(",; "))
                    elif inside_brackets:
                        word_nodes.append(node)

                    # A separator outside brackets ends the current phrase.
                    if not inside_brackets and node.strip().endswith(
                        (",", ";", "/")
                    ):
                        word = clean_node(wxr, None, prefix_nodes + word_nodes)
                        word_nodes.clear()
                        _append_phrase_linkage(word_entry, word, title_text)
                elif isinstance(node, WikiNode):
                    if node.kind == NodeKind.LIST:
                        # Nested lists are skipped here; their items are
                        # reached by the recursive list-item search above.
                        continue
                    elif node.kind == NodeKind.LINK:
                        before_link = False
                    if before_link:
                        prefix_nodes.append(node)
                    else:
                        word_nodes.append(node)

            # Flush whatever is left after the last separator.
            word = clean_node(wxr, None, prefix_nodes + word_nodes)
            _append_phrase_linkage(word_entry, word, title_text)

200 

201 

def process_semantics_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    sense_index: int,
) -> None:
    """Turn linkage parameters of the semantics template into linkages.

    See https://ru.wiktionary.org/wiki/Шаблон:семантика.  For each template
    parameter whose name is a key of ``LINKAGE_TITLES`` and whose value is a
    plain string, the value is split on commas.  Every item other than an
    empty string, ``-`` or ``—`` becomes a ``Linkage`` with the given
    ``sense_index``, appended to the ``word_entry`` attribute named by
    ``LINKAGE_TITLES``.
    """
    placeholders = ("", "-", "—")
    for param_name, param_value in template_node.template_parameters.items():
        if param_name not in LINKAGE_TITLES:
            continue
        if not isinstance(param_value, str):
            continue
        for raw_item in param_value.split(","):
            item = raw_item.strip()
            if item in placeholders:
                continue
            target_list = getattr(word_entry, LINKAGE_TITLES[param_name])
            target_list.append(Linkage(word=item, sense_index=sense_index))

217 

218 

def extract_alt_form_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    tags: list[str],
) -> None:
    """Record every link in the section as an alternative form.

    Each link node found recursively under ``level_node`` is cleaned with
    ``clean_node``; non-empty results are appended to ``word_entry.forms`` as
    ``Form`` objects built with the given ``tags``.
    """
    for link in level_node.find_child_recursively(NodeKind.LINK):
        form_text = clean_node(wxr, None, link)
        if form_text == "":
            continue
        word_entry.forms.append(Form(form=form_text, tags=tags))
228 word_entry.forms.append(Form(form=word, tags=tags))