Coverage for src/wiktextract/extractor/ru/linkage.py: 86%

1from wikitextprocessor import (

2 HTMLNode,

3 NodeKind,

4 TemplateNode,

5 WikiNode,

8from ...page import clean_node

9from ...wxr_context import WiktextractContext

10from .models import Linkage, WordEntry

11from .section_titles import LINKAGE_TITLES

12from .tags import translate_raw_tags

def extract_linkages(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    level_node: WikiNode,
) -> None:
    """Collect linkage words from the list items below ``level_node``.

    Each ``LIST_ITEM`` found recursively under ``level_node`` is scanned
    child by child: a link node supplies the word, a template node supplies
    raw tags (through ``find_linkage_tag``), and a text node that is exactly
    ``,`` or ``;`` after stripping ends the current word.  Finished entries
    are appended to the ``linkage_type`` attribute of ``word_entry``.

    Args:
        wxr: Extraction context; used for debug logging and node cleaning.
        word_entry: Entry that receives the extracted ``Linkage`` objects.
        linkage_type: Name of the list field on ``word_entry`` to append to.
        level_node: Section node whose list items are scanned.
    """
    # Read the field table from the model class rather than the instance:
    # instance access to ``model_fields`` is deprecated in recent Pydantic.
    if linkage_type not in type(word_entry).model_fields:
        wxr.wtp.debug(
            f"Linkage type {linkage_type} not defined for word entry",
            sortid="extractor/ru/linkage/extract_linkages/10",
        )
        return

    sense_index = 0
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Only items whose list marker is exactly "#" advance the counter.
        if list_item.sarg == "#":
            sense_index += 1
        linkage = Linkage(sense_index=sense_index)
        for node in list_item.children:
            if isinstance(node, WikiNode):
                if node.kind == NodeKind.LINK:
                    linkage.word = clean_node(wxr, None, node)
                elif isinstance(node, TemplateNode):
                    find_linkage_tag(wxr, linkage, node)
            elif isinstance(node, str) and node.strip() in (";", ","):
                if len(linkage.word) > 0:
                    translate_raw_tags(linkage)
                    getattr(word_entry, linkage_type).append(linkage)
                    tags = linkage.raw_tags
                    linkage = Linkage(sense_index=sense_index)
                    if node.strip() == ",":
                        # A comma keeps the previous word's remaining raw
                        # tags for the next word; a semicolon starts clean.
                        # Copy the list so that later in-place appends to
                        # the new linkage cannot leak into the entry that
                        # was just stored.
                        linkage.raw_tags = list(tags)

        # Flush the last word of the item (no trailing separator follows it).
        if len(linkage.word) > 0:
            translate_raw_tags(linkage)
            getattr(word_entry, linkage_type).append(linkage)

def find_linkage_tag(
    wxr: WiktextractContext,
    linkage: Linkage,
    template_node: TemplateNode,
) -> None:
    """Append the text of every ``span`` in an expanded template as a raw tag.

    The template node is turned back into wikitext and re-parsed with
    ``expand_all=True``.  Each ``span`` element found recursively in the
    result is passed through ``clean_node``; every non-empty result is
    appended to ``linkage.raw_tags`` in the order the spans are yielded.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    for span in expanded_root.find_html_recursively("span"):
        if tag_text := clean_node(wxr, None, span):
            linkage.raw_tags.append(tag_text)

def process_related_block_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract related words from an expanded related-words block template.

    Used for the "Родственные слова" (related words) section, template
    "Шаблон:родств-блок".  The template is expanded and each table in the
    result is walked row by row.  A table-level header and a row-level
    header are tracked while walking; every link found inside a list item
    of an ordinary cell becomes a ``Linkage`` whose raw tags are the
    non-empty headers in effect at that point, appended to
    ``word_entry.related``.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    item_child_kinds = NodeKind.HTML | NodeKind.LINK

    for table in expanded_root.find_child(NodeKind.TABLE):
        # The table header persists across rows until it is replaced.
        table_header = ""
        for table_row in table.find_child(NodeKind.TABLE_ROW):
            # The row header is reset at the start of every row.
            row_header = ""
            for cell in table_row.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text = clean_node(wxr, None, cell)
                    if header_text.startswith("Список всех слов с корнем"):
                        table_header = header_text
                    continue
                if cell.kind != NodeKind.TABLE_CELL:
                    continue
                if "block-head" in cell.attrs.get("class", ""):
                    # A "block-head" cell acts as a new table header.
                    table_header = clean_node(wxr, None, cell)
                    continue

                for item in cell.find_child_recursively(NodeKind.LIST_ITEM):
                    for child in item.find_child(item_child_kinds):
                        if isinstance(child, HTMLNode) and child.tag == "span":
                            # A span labels the links that follow it.
                            label = clean_node(wxr, None, child)
                            row_header = label.removesuffix(":")
                            continue
                        if child.kind != NodeKind.LINK:
                            continue

                        related = Linkage(word=clean_node(wxr, None, child))
                        related.raw_tags.extend(
                            header
                            for header in (table_header, row_header)
                            if header != ""
                        )
                        if related.word != "":
                            translate_raw_tags(related)
                            word_entry.related.append(related)

114

115

def _append_phrase_linkage(
    word_entry: WordEntry, word: str, title_text: str
) -> None:
    """Store one extracted phrase on ``word_entry``; empty words are ignored.

    The section title is added as a raw tag (and translated) unless it is
    one of the two generic titles.  Phrases from the proverbs section go to
    ``word_entry.proverbs``; all others go to ``word_entry.derived``.
    """
    if len(word) == 0:
        return
    linkage = Linkage(word=word)
    if title_text not in (
        "фразеологизмы и устойчивые сочетания",
        "пословицы и поговорки",
    ):
        linkage.raw_tags.append(title_text)
        translate_raw_tags(linkage)
    if title_text == "пословицы и поговорки":
        word_entry.proverbs.append(linkage)
    else:
        word_entry.derived.append(linkage)


def extract_phrase_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
    title_text: str,
) -> None:
    """Extract phrases from a phrase/idiom/proverb section.

    Handles the "Фразеологизмы и устойчивые сочетания" (phraseologisms and
    set expressions) section and similar ones.

    Args:
        wxr: Extraction context used for template expansion and cleaning.
        word_entry: Entry that receives the extracted ``Linkage`` objects.
        level_node: Section node (or an expanded ``div``) to scan.
        title_text: Section title; decides the target list and the raw tag.
    """
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        # a template that adds links to words in list
        # https://ru.wiktionary.org/wiki/Шаблон:в_три_колонки
        if t_node.template_name.lower() in ["в три колонки", "фразеологизмы"]:
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(t_node), expand_all=True
            )
            # The lists live inside "col3" divs of the expanded template, so
            # each div is processed as if it were a section node.
            for div_tag in expanded_node.find_html(
                "div", attr_name="class", attr_value="col3"
            ):
                extract_phrase_section(wxr, word_entry, div_tag, title_text)

    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child_recursively(NodeKind.LIST_ITEM):
            # Nodes seen before the first link; never cleared, so they are
            # prepended to every phrase extracted from this item.
            prefix_nodes = []
            before_link = True
            # Nodes of the phrase currently being collected.
            word_nodes = []
            inside_brackets = False
            for node in list_item.children:
                if isinstance(node, str) and len(node.strip()) > 0:
                    text = node.strip()
                    if before_link:
                        prefix_nodes.append(node)
                    elif text.startswith("("):
                        inside_brackets = True
                        word_nodes.append(node)
                    elif text.startswith(")"):
                        inside_brackets = False
                        # Drop separators that trail the closing bracket.
                        word_nodes.append(node.strip(",; "))
                    elif inside_brackets:
                        word_nodes.append(node)

                    # A separator outside brackets ends the current phrase.
                    if not inside_brackets and text.endswith((",", ";", "/")):
                        word = clean_node(wxr, None, prefix_nodes + word_nodes)
                        word_nodes.clear()
                        _append_phrase_linkage(word_entry, word, title_text)
                elif isinstance(node, WikiNode):
                    if node.kind == NodeKind.LIST:
                        # Nested lists are skipped here; their items are
                        # visited by the recursive list-item search above.
                        continue
                    elif node.kind == NodeKind.LINK:
                        before_link = False
                    if before_link:
                        prefix_nodes.append(node)
                    else:
                        word_nodes.append(node)

            # Flush whatever remains after the last node of the item.
            word = clean_node(wxr, None, prefix_nodes + word_nodes)
            _append_phrase_linkage(word_entry, word, title_text)

194

195

def process_semantics_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    sense_index: int,
) -> None:
    """Turn the parameters of a semantics template into linkages.

    See https://ru.wiktionary.org/wiki/Шаблон:семантика

    Every template parameter whose name is a key of ``LINKAGE_TITLES`` and
    whose value is a plain string is split on commas.  Each stripped piece
    other than ``""`` and ``"-"`` becomes a ``Linkage`` carrying
    ``sense_index``, appended to the ``word_entry`` list named by
    ``LINKAGE_TITLES[key]``.
    """
    for param_name, param_value in template_node.template_parameters.items():
        if param_name not in LINKAGE_TITLES:
            continue
        if not isinstance(param_value, str):
            continue
        stripped_parts = (part.strip() for part in param_value.split(","))
        for candidate in stripped_parts:
            if candidate in ("", "-"):
                continue
            target_list = getattr(word_entry, LINKAGE_TITLES[param_name])
            target_list.append(
                Linkage(word=candidate, sense_index=sense_index)
            )