Coverage for src/wiktextract/extractor/ru/linkage.py: 86%

125 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from wikitextprocessor import ( 

2 HTMLNode, 

3 NodeKind, 

4 TemplateNode, 

5 WikiNode, 

6) 

7 

8from ...page import clean_node 

9from ...wxr_context import WiktextractContext 

10from .models import Linkage, WordEntry 

11from .section_titles import LINKAGE_TITLES 

12from .tags import translate_raw_tags 

13 

14 

def extract_linkages(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    level_node: WikiNode,
) -> None:
    """Collect linkage words from the list items found under ``level_node``.

    Each list item is scanned left to right: a link node supplies the word,
    a template node contributes raw tags through ``find_linkage_tag``, and a
    bare ``;`` or ``,`` string closes the word collected so far.  Every
    finished word is passed to ``translate_raw_tags`` and appended to the
    ``linkage_type`` attribute of ``word_entry``.

    Args:
        wxr: Extraction context, used for debug logging and node cleaning.
        word_entry: Entry whose ``linkage_type`` list receives the results.
        linkage_type: Name of the list attribute on ``word_entry``.
        level_node: Node whose list items are searched recursively.

    Returns:
        None.  If ``linkage_type`` is not among ``word_entry.model_fields``
        a debug message is logged and nothing is extracted.
    """
    if linkage_type not in word_entry.model_fields:
        wxr.wtp.debug(
            f"Linkage type {linkage_type} not defined for word entry",
            sortid="extractor/ru/linkage/extract_linkages/10",
        )
        return

    def commit(finished: Linkage) -> None:
        # Shared by the separator path and the end-of-item path.
        translate_raw_tags(finished)
        getattr(word_entry, linkage_type).append(finished)

    sense_index = 0
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Only items whose list marker is exactly "#" advance the counter.
        if list_item.sarg == "#":
            sense_index += 1
        linkage = Linkage(sense_index=sense_index)
        for node in list_item.children:
            if isinstance(node, WikiNode):
                if node.kind == NodeKind.LINK:
                    linkage.word = clean_node(wxr, None, node)
                elif isinstance(node, TemplateNode):
                    find_linkage_tag(wxr, linkage, node)
                continue
            if not isinstance(node, str):
                continue
            separator = node.strip()
            if separator not in (";", ","):
                continue
            # NOTE(review): treating the tag carry-over below as nested under
            # the non-empty-word check is a reconstruction - confirm.
            if not linkage.word:
                continue
            commit(linkage)
            # NOTE(review): the tags are read after translate_raw_tags ran;
            # if that helper rewrites raw_tags, the carried list may differ
            # from what the templates originally supplied - confirm.
            carried_tags = linkage.raw_tags
            linkage = Linkage(sense_index=sense_index)
            # A comma keeps the tags for the next word; a semicolon does not.
            if separator == ",":
                linkage.raw_tags = carried_tags

        # Flush the last word of the item, which has no trailing separator.
        if linkage.word:
            commit(linkage)

51 

52 

def find_linkage_tag(
    wxr: WiktextractContext,
    linkage: Linkage,
    template_node: TemplateNode,
) -> None:
    """Append the text of every ``<span>`` in the expanded template as a tag.

    The template node is converted back to wikitext and re-parsed with full
    expansion.  Each ``span`` element found anywhere in the result is cleaned
    to plain text, and non-empty texts are added to ``linkage.raw_tags``.

    Args:
        wxr: Extraction context providing the parser and node cleaning.
        linkage: Linkage whose ``raw_tags`` list is extended in place.
        template_node: Template to expand and search.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    for span in expanded_root.find_html_recursively("span"):
        tag_text = clean_node(wxr, None, span)
        if not tag_text:
            continue
        linkage.raw_tags.append(tag_text)

65 

66 

def process_related_block_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract related words from the tables of an expanded template.

    The template is expanded and every table directly under the result is
    walked row by row.  Header cells whose text starts with
    "Список всех слов с корнем", and data cells whose ``class`` attribute
    contains "block-head", set the current table header.  In the remaining
    data cells each list item is scanned: a ``span`` sets the row header
    (trailing ":" removed) and each link becomes a ``Linkage`` tagged with
    the non-empty table and row headers.  Linkages with a non-empty word are
    appended to ``word_entry.related``.

    Args:
        wxr: Extraction context providing the parser and node cleaning.
        word_entry: Entry whose ``related`` list receives the linkages.
        template_node: Template to expand and read.
    """
    # Used for the "Родственные слова" section (Шаблон:родств-блок).
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    item_kinds = NodeKind.HTML | NodeKind.LINK
    for table in root.find_child(NodeKind.TABLE):
        # The table header persists across rows; the row header is per row.
        table_header = ""
        for table_row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            for cell in table_row.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text = clean_node(wxr, None, cell)
                    if header_text.startswith("Список всех слов с корнем"):
                        table_header = header_text
                    continue
                if cell.kind != NodeKind.TABLE_CELL:
                    continue
                if "block-head" in cell.attrs.get("class", ""):
                    table_header = clean_node(wxr, None, cell)
                    continue
                for list_item in cell.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for child in list_item.find_child(item_kinds):
                        if isinstance(child, HTMLNode) and child.tag == "span":
                            label = clean_node(wxr, None, child)
                            row_header = label.removesuffix(":")
                            continue
                        if child.kind != NodeKind.LINK:
                            continue
                        linkage = Linkage(word=clean_node(wxr, None, child))
                        # Table header first, then row header.
                        for header in (table_header, row_header):
                            if header != "":
                                linkage.raw_tags.append(header)
                        if linkage.word == "":
                            continue
                        translate_raw_tags(linkage)
                        word_entry.related.append(linkage)

114 

115 

def extract_phrase_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
    title_text: str,
) -> None:
    """Extract phrases from the lists under ``level_node``.

    Templates named "в три колонки" or "фразеологизмы" are expanded first and
    each ``div`` of class "col3" in the result is processed recursively.
    Then every list item is split into phrases: nodes seen before the first
    link form a prefix shared by all phrases of the item, and a string ending
    in ",", ";" or "/" outside brackets closes the current phrase.

    Args:
        wxr: Extraction context providing the parser and node cleaning.
        word_entry: Entry receiving the phrases; ``proverbs`` when
            ``title_text`` is "пословицы и поговорки", otherwise ``derived``.
        level_node: Node whose direct template and list children are read.
        title_text: Section title; added as a raw tag unless it is one of
            the two default titles.
    """
    # Used for the "Фразеологизмы и устойчивые сочетания" section.
    untagged_titles = [
        "фразеологизмы и устойчивые сочетания",
        "пословицы и поговорки",
    ]

    def store(nodes: list) -> None:
        # Turn the collected nodes into one linkage; empty text is skipped.
        word = clean_node(wxr, None, nodes)
        if not word:
            return
        linkage = Linkage(word=word)
        if title_text not in untagged_titles:
            linkage.raw_tags.append(title_text)
            # NOTE(review): translating only when a tag was added is a
            # reconstruction of the original nesting - confirm.
            translate_raw_tags(linkage)
        if title_text == "пословицы и поговорки":
            word_entry.proverbs.append(linkage)
        else:
            word_entry.derived.append(linkage)

    column_templates = ["в три колонки", "фразеологизмы"]
    for template in level_node.find_child(NodeKind.TEMPLATE):
        # These templates add links to the words of a list, see
        # https://ru.wiktionary.org/wiki/Шаблон:в_три_колонки
        if template.template_name.lower() not in column_templates:
            continue
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template), expand_all=True
        )
        for column_div in expanded.find_html(
            "div", attr_name="class", attr_value="col3"
        ):
            extract_phrase_section(wxr, word_entry, column_div, title_text)

    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child_recursively(NodeKind.LIST_ITEM):
            prefix_nodes = []
            word_nodes = []
            before_link = True
            inside_brackets = False
            for node in list_item.children:
                if isinstance(node, str):
                    text = node.strip()
                    if len(text) == 0:
                        continue
                    if before_link:
                        prefix_nodes.append(node)
                    elif text.startswith("("):
                        inside_brackets = True
                        word_nodes.append(node)
                    elif text.startswith(")"):
                        inside_brackets = False
                        # Keep the bracket, drop the separator after it.
                        word_nodes.append(node.strip(",; "))
                    elif inside_brackets:
                        word_nodes.append(node)

                    if not inside_brackets and text.endswith((",", ";", "/")):
                        store(prefix_nodes + word_nodes)
                        # The prefix is kept for the next phrase of the item.
                        word_nodes.clear()
                elif isinstance(node, WikiNode):
                    # Nested lists are reached by the recursive item search.
                    if node.kind == NodeKind.LIST:
                        continue
                    if node.kind == NodeKind.LINK:
                        before_link = False
                    bucket = prefix_nodes if before_link else word_nodes
                    bucket.append(node)

            # Whatever remains after the last separator is the final phrase.
            store(prefix_nodes + word_nodes)

194 

195 

def process_semantics_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    sense_index: int,
) -> None:
    """Add linkages listed in the parameters of a semantics template.

    For every template parameter whose name is a key of ``LINKAGE_TITLES``
    and whose value is a string, the value is split on commas.  Each piece,
    stripped of surrounding whitespace and not equal to "" or "-", becomes a
    ``Linkage`` appended to the ``word_entry`` attribute that
    ``LINKAGE_TITLES`` maps the parameter name to.

    Args:
        wxr: Extraction context; not used by this function.
        word_entry: Entry whose linkage lists are extended.
        template_node: Template whose ``template_parameters`` are read.
        sense_index: Sense index stored on every created linkage.
    """
    # Template reference: https://ru.wiktionary.org/wiki/Шаблон:семантика
    for param_name, param_value in template_node.template_parameters.items():
        if param_name not in LINKAGE_TITLES:
            continue
        if not isinstance(param_value, str):
            continue
        for piece in param_value.split(","):
            word = piece.strip()
            if word in ("", "-"):
                continue
            target = getattr(word_entry, LINKAGE_TITLES[param_name])
            target.append(Linkage(word=word, sense_index=sense_index))