Coverage for src/wiktextract/extractor/pl/linkage.py: 91%

1import re

2from collections import defaultdict

4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode

6from ...page import clean_node

7from ...wxr_context import WiktextractContext

8from .models import Linkage, WordEntry

9from .tags import TAGS, translate_raw_tags

# Polish Wiktionary linkage section titles mapped to English linkage names.
# NOTE(review): the values are presumably the `WordEntry` list attributes
# that `extract_linkage_section` receives as `linkage_type` - confirm against
# the caller.  Several titles intentionally share one target ("related",
# "derived").
LINKAGE_TYPES: dict[str, str] = {
    "antonimy": "antonyms",
    "hiperonimy": "hypernyms",
    "hiponimy": "hyponyms",
    "holonimy": "holonyms",
    "kolokacje": "related",
    "meronimy": "meronyms",
    "synonimy": "synonyms",
    "wyrazy pochodne": "derived",
    "wyrazy pokrewne": "related",
    "związki frazeologiczne": "proverbs",
    "złożenia": "derived",
}

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    linkage_type: str,
    lang_code: str,
) -> None:
    """Collect the linkages of one section and attach them to ``page_data``.

    Every list item found anywhere below ``level_node`` is parsed with
    ``process_linkage_list_item``.  If the section contains no list item at
    all, its link and template nodes are read directly instead.

    The collected entries are grouped by sense index and appended to the
    ``linkage_type`` list attribute of the ``page_data`` entries whose
    ``lang_code`` equals ``lang_code``:

    * a group goes to every such entry accepted by ``match_sense_index``;
    * the group without a sense index (key ``""``) goes to every such entry;
    * groups accepted by no entry go to the first such entry only.

    Args:
        wxr: Extraction context, forwarded to ``clean_node`` and the helpers.
        page_data: Word entries of the page; modified in place.
        level_node: Section node whose descendants are scanned.
        linkage_type: Name of the list attribute on each entry to extend.
        lang_code: Only entries with this language code receive linkages.
    """
    # Function-level import kept from the original.
    # NOTE(review): presumably avoids a circular import with `.page` - confirm.
    from .page import match_sense_index

    linkages: defaultdict[str, list[Linkage]] = defaultdict(list)
    has_list = False
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_linkage_list_item(wxr, list_item, linkages)
        has_list = True

    if not has_list:
        # get around "preformatted" node
        for node in level_node.find_child_recursively(
            NodeKind.LINK | NodeKind.TEMPLATE
        ):
            if node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "":
                    linkages[""].append(Linkage(word=word))
            elif isinstance(node, TemplateNode):
                # The three buffers are throwaway lists: only linkages that
                # the template handler stores itself are kept here.
                process_linkage_template(
                    wxr, node, linkages, "", False, [], [], []
                )

    # Both passes below only look at entries of the requested language.
    lang_entries = [data for data in page_data if data.lang_code == lang_code]

    matched_indexes = set()
    for data in lang_entries:
        target = getattr(data, linkage_type)
        for sense_index, linkage_list in linkages.items():
            if match_sense_index(sense_index, data):
                target.extend(linkage_list)
                matched_indexes.add(sense_index)
        # Linkages without a sense index belong to every entry.
        target.extend(linkages.get("", []))

    # add not matched data
    linkages.pop("", None)
    if lang_entries:
        # Unmatched groups are attached once, to the first entry only.
        fallback = getattr(lang_entries[0], linkage_type)
        for sense_index, linkage_list in linkages.items():
            if sense_index not in matched_indexes:
                fallback.extend(linkage_list)

def _append_linkage(
    wxr: WiktextractContext,
    linkages: dict[str, list[Linkage]],
    sense_index: str,
    word_nodes: list,
    translation_nodes: list,
    raw_tags: list[str],
) -> None:
    """Build a ``Linkage`` from the buffers; store it unless its word is empty."""
    word = clean_node(wxr, None, word_nodes)
    if not word:
        return
    linkage = Linkage(
        word=word,
        translation=clean_node(wxr, None, translation_nodes),
        # Copy: the caller clears and reuses this list for the next entry.
        raw_tags=list(raw_tags),
        sense_index=sense_index,
    )
    translate_raw_tags(linkage)
    linkages[sense_index].append(linkage)


def process_linkage_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    linkages: dict[str, list[Linkage]],
) -> None:
    """Parse one list item into ``Linkage`` entries grouped by sense index.

    The children of ``list_item`` are scanned left to right:

    * In a text child, a parenthesised index such as ``(1.1)`` sets the
      sense index for what follows; text before it in that child is dropped.
    * ``→`` starts the translation of the current word.
    * The first of ``;``, ``•``, ``,``, ``/`` (tested in that order) found in
      a text child ends the current entry; only its first occurrence splits.
    * Templates are delegated to ``process_linkage_template``.
    * Nested lists are skipped.
    * Any other child is added to the current word or translation.

    Entries whose cleaned word is empty are discarded.

    Args:
        wxr: Extraction context, forwarded to ``clean_node``.
        list_item: The list item node to parse.
        linkages: Output mapping from sense index to entries; modified in
            place.
    """
    raw_tags: list[str] = []
    sense_index = ""
    word_nodes: list = []
    translation_nodes: list = []
    is_translation = False
    for node in list_item.children:
        if isinstance(node, str):
            # Same set as the original class `[\d\s,-.]`, whose `,-.` was an
            # accidental range covering exactly `,`, `-` and `.`.
            m = re.search(r"\([\d\s,.\-]+\)", node)
            if m is not None:
                sense_index = m.group(0).strip("()")
                node = node[m.end() :]

            if "→" in node:
                is_translation = True
                before, _, after = node.partition("→")
                word_nodes.append(before)
                translation_nodes.append(after)
                continue

            sep = next((s for s in (";", "•", ",", "/") if s in node), None)
            target = translation_nodes if is_translation else word_nodes
            if sep is None:
                target.append(node)
                continue

            # The separator closes the current entry; the text after it
            # starts the next word.
            before, _, after = node.partition(sep)
            target.append(before)
            _append_linkage(
                wxr,
                linkages,
                sense_index,
                word_nodes,
                translation_nodes,
                raw_tags,
            )
            # Clear in place: the same list objects are handed to
            # process_linkage_template for later children.
            word_nodes.clear()
            translation_nodes.clear()
            raw_tags.clear()
            is_translation = False
            word_nodes.append(after)
        elif isinstance(node, TemplateNode):
            process_linkage_template(
                wxr,
                node,
                linkages,
                sense_index,
                is_translation,
                word_nodes,
                translation_nodes,
                raw_tags,
            )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Tested before `is_translation` so that a nested list is never
            # folded into the translation text.  extract_linkage_section
            # searches LIST_ITEM nodes recursively, so nested items are
            # parsed on their own.
            continue
        elif is_translation:
            translation_nodes.append(node)
        else:
            word_nodes.append(node)

    if word_nodes:
        _append_linkage(
            wxr, linkages, sense_index, word_nodes, translation_nodes, raw_tags
        )

159

160

def process_linkage_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    linkages: dict[str, list[Linkage]],
    sense_index: str,
    is_translation: bool,
    word_nodes: list[WikiNode],
    tr_nodes: list[WikiNode],
    raw_tags: list[str],
) -> None:
    """Handle one template found inside a linkage list item or section.

    A ``furi`` template whose expanded text contains ``(`` becomes a
    complete ``Linkage``: the text before the last ``(`` is the word and the
    rest, without parentheses and spaces, is the furigana.  A ``furi``
    template without ``(`` or with an empty word is ignored.

    Any other template is expanded to text.  If that text ends with ``.`` or
    is found in ``TAGS`` it is appended to ``raw_tags``; otherwise it is
    appended to ``tr_nodes`` (when ``is_translation``) or ``word_nodes``.

    Args:
        wxr: Extraction context, forwarded to ``clean_node``.
        template_node: The template to process.
        linkages: Output mapping from sense index to entries; modified in
            place by the ``furi`` branch.
        sense_index: Sense index of the entry currently being built.
        is_translation: Whether the caller is collecting a translation.
        word_nodes: Word buffer of the caller; may receive the expanded text
            (a plain string).
        tr_nodes: Translation buffer of the caller; may receive the expanded
            text (a plain string).
        raw_tags: Tag buffer of the caller; may receive the expanded text.
    """
    expanded_text = clean_node(wxr, None, template_node)

    if template_node.template_name == "furi":
        if "(" not in expanded_text:
            return
        word, _, furigana = expanded_text.rpartition("(")
        # Strip the word and drop it when empty, as the list-item parser
        # does for every other entry.
        word = word.strip()
        if not word:
            return
        linkages[sense_index].append(
            Linkage(
                word=word,
                furigana=furigana.strip("() "),
                sense_index=sense_index,
            )
        )
        return

    if expanded_text.endswith(".") or expanded_text in TAGS:
        raw_tags.append(expanded_text)
    elif is_translation:
        tr_nodes.append(expanded_text)
    else:
        word_nodes.append(expanded_text)

187 else:

188 word_nodes.append(raw_tag)