Coverage for src/wiktextract/extractor/pl/linkage.py: 91%

104 statements  

coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1 import re
2 from collections import defaultdict
3
4 from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
5
6 from ...page import clean_node
7 from ...wxr_context import WiktextractContext
8 from .models import Linkage, WordEntry
9 from .tags import TAGS, translate_raw_tags
10
11 LINKAGE_TYPES = {
12     "antonimy": "antonyms",
13     "hiperonimy": "hypernyms",
14     "hiponimy": "hyponyms",
15     "holonimy": "holonyms",
16     "kolokacje": "related",
17     "meronimy": "meronyms",
18     "synonimy": "synonyms",
19     "wyrazy pokrewne": "related",
20     "związki frazeologiczne": "proverbs",
21     "złożenia": "derived",
22 }
23
24
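Several headings share one target field: both "kolokacje" and "wyrazy pokrewne" end up in "related". Below is a minimal, self-contained sketch of how a section heading might be resolved through this table; the linkage_field helper and the lowercase normalisation are assumptions for illustration, not part of the module above (the real lookup is presumably done by the caller of extract_linkage_section, which is outside this report).

# Hypothetical helper, not part of linkage.py; a trimmed copy of the
# mapping is inlined so the snippet runs on its own.
LINKAGE_TYPES = {
    "kolokacje": "related",
    "wyrazy pokrewne": "related",
    "synonimy": "synonyms",
}

def linkage_field(title: str) -> str | None:
    return LINKAGE_TYPES.get(title.strip().lower())

assert linkage_field("Synonimy") == "synonyms"
assert linkage_field("kolokacje") == linkage_field("wyrazy pokrewne") == "related"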

25 def extract_linkage_section(
26     wxr: WiktextractContext,
27     page_data: list[WordEntry],
28     level_node: WikiNode,
29     linkage_type: str,
30     lang_code: str,
31 ) -> None:
32     from .page import match_sense_index
33
34     linkages = defaultdict(list)
35     has_list = False
36     for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
37         process_linkage_list_item(wxr, list_item, linkages)
38         has_list = True
39
40     if not has_list:
41         # get around "preformatted" node
42         for node in level_node.find_child_recursively(
43             NodeKind.LINK | NodeKind.TEMPLATE
44         ):
45             if node.kind == NodeKind.LINK:
46                 word = clean_node(wxr, None, node)
47                 if word != "":    [47 ↛ 42: line 47 didn't jump to line 42 because the condition on line 47 was always true]
48                     linkages[""].append(Linkage(word=word))
49             elif isinstance(node, TemplateNode):    [49 ↛ 42: line 49 didn't jump to line 42 because the condition on line 49 was always true]
50                 process_linkage_template(
51                     wxr, node, linkages, "", False, [], [], []
52                 )
53
54     matched_indexes = set()
55     for data in page_data:
56         if data.lang_code == lang_code:    [56 ↛ 55: line 56 didn't jump to line 55 because the condition on line 56 was always true]
57             for sense_index in linkages.keys():
58                 if match_sense_index(sense_index, data):
59                     getattr(data, linkage_type).extend(linkages[sense_index])
60                     matched_indexes.add(sense_index)
61             getattr(data, linkage_type).extend(linkages.get("", []))
62
63     # add not matched data
64     if "" in linkages:
65         del linkages[""]
66     for data in page_data:    [66 ↛ exit: line 66 didn't return from function 'extract_linkage_section' because the loop on line 66 didn't complete]
67         if data.lang_code == lang_code:    [67 ↛ 66: line 67 didn't jump to line 66 because the condition on line 67 was always true]
68             for sense_index, linkage_list in linkages.items():
69                 if sense_index not in matched_indexes:    [69 ↛ 70: line 69 didn't jump to line 70 because the condition on line 69 was never true]
70                     getattr(data, linkage_type).extend(linkage_list)
71             break
72
73
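The report flags line 69 as a branch that never fires in the test suite: no non-empty sense-index bucket is ever left unmatched, so the fallback never runs. The toy model below (plain dicts and strings standing in for WordEntry, Linkage and match_sense_index, all assumed) mirrors the distribution order of extract_linkage_section, including that fallback.

# Self-contained toy model of the distribution logic; the dict-based
# entries, the literal sense lists, and the hardcoded "synonyms" field
# are assumptions for illustration.
def distribute(linkages: dict[str, list[str]], entries: list[dict], lang_code: str) -> None:
    matched = set()
    for entry in entries:
        if entry["lang_code"] != lang_code:
            continue
        for idx, words in linkages.items():
            if idx in entry["senses"]:  # stand-in for match_sense_index()
                entry["synonyms"].extend(words)
                matched.add(idx)
        entry["synonyms"].extend(linkages.get("", []))  # un-indexed bucket goes to every entry
    # buckets that matched no sense fall back to the first entry of the language
    leftovers = [w for idx, ws in linkages.items() if idx and idx not in matched for w in ws]
    for entry in entries:
        if entry["lang_code"] == lang_code:
            entry["synonyms"].extend(leftovers)
            break

linkages = {"1.1": ["dom"], "": ["budynek"], "9.9": ["chata"]}
entries = [{"lang_code": "pl", "senses": ["1.1"], "synonyms": []}]
distribute(linkages, entries, "pl")
assert entries[0]["synonyms"] == ["dom", "budynek", "chata"]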

74 def process_linkage_list_item(
75     wxr: WiktextractContext,
76     list_item: WikiNode,
77     linkages: dict[str, list[Linkage]],
78 ) -> None:
79     raw_tags = []
80     sense_index = ""
81     word_nodes = []
82     translation_nodes = []
83     is_translation = False
84     for node in list_item.children:
85         if isinstance(node, str):
86             m = re.search(r"\([\d\s,-.]+\)", node)
87             if m is not None:
88                 sense_index = m.group(0).strip("()")
89                 node = node[m.end() :]
90
91             if "→" in node:
92                 is_translation = True
93                 tr_start = node.index("→")
94                 word_nodes.append(node[:tr_start])
95                 translation_nodes.append(node[tr_start + 1 :])
96             else:
97                 has_sep = False
98                 for sep in [";", "•", ",", "/"]:
99                     if sep in node:
100                         has_sep = True
101                         sep_index = node.index(sep)
102                         if is_translation:
103                             translation_nodes.append(node[:sep_index])
104                         else:
105                             word_nodes.append(node[:sep_index])
106                         linkage = Linkage(
107                             word=clean_node(wxr, None, word_nodes),
108                             translation=clean_node(
109                                 wxr, None, translation_nodes
110                             ),
111                             raw_tags=raw_tags,
112                             sense_index=sense_index,
113                         )
114                         translate_raw_tags(linkage)
115                         if len(linkage.word) > 0:    [115 ↛ 118: line 115 didn't jump to line 118 because the condition on line 115 was always true]
116                             linkages[sense_index].append(linkage)
117
118                         word_nodes.clear()
119                         translation_nodes.clear()
120                         is_translation = False
121                         raw_tags.clear()
122                         word_nodes.append(node[sep_index + 1 :])
123                         break
124                 if not has_sep:
125                     if is_translation:
126                         translation_nodes.append(node)
127                     else:
128                         word_nodes.append(node)
129         elif isinstance(node, TemplateNode):
130             process_linkage_template(
131                 wxr,
132                 node,
133                 linkages,
134                 sense_index,
135                 is_translation,
136                 word_nodes,
137                 translation_nodes,
138                 raw_tags,
139             )
140         elif is_translation:
141             translation_nodes.append(node)
142         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
143             continue
144         else:
145             word_nodes.append(node)
146
147     if len(word_nodes) > 0:    [147 ↛ exit: line 147 didn't return from function 'process_linkage_list_item' because the condition on line 147 was always true]
148         word = clean_node(wxr, None, word_nodes)
149         if len(word) > 0:
150             linkage = Linkage(
151                 word=word,
152                 translation=clean_node(wxr, None, translation_nodes),
153                 raw_tags=raw_tags,
154                 sense_index=sense_index,
155             )
156             translate_raw_tags(linkage)
157             linkages[sense_index].append(linkage)
158
159
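The string handling above relies on three conventions of Polish Wiktionary linkage lists: a leading sense index such as "(1.1)", a "→" arrow introducing a translation, and ";", "•", "," or "/" separating items. The following quick check exercises the first two with the same regular expression as line 86; the sample list-item string is made up.

import re

# Assumed sample fragment of a linkage list item.
node = "(1.1) przykład → example"

# Same pattern as line 86: digits, whitespace, commas, hyphens and dots in parentheses.
m = re.search(r"\([\d\s,-.]+\)", node)
assert m is not None
sense_index = m.group(0).strip("()")
rest = node[m.end():]

# The arrow splits the remaining text into word and translation parts,
# mirroring lines 91-95.
tr_start = rest.index("→")
word_part, translation_part = rest[:tr_start], rest[tr_start + 1:]
assert sense_index == "1.1"
assert word_part.strip() == "przykład" and translation_part.strip() == "example"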

160 def process_linkage_template(
161     wxr: WiktextractContext,
162     template_node: TemplateNode,
163     linkages: dict[str, list[Linkage]],
164     sense_index: str,
165     is_translation: bool,
166     word_nodes: list[WikiNode],
167     tr_nodes: list[WikiNode],
168     raw_tags: list[str],
169 ) -> None:
170     if template_node.template_name == "furi":
171         expanded_text = clean_node(wxr, None, template_node)
172         if "(" in expanded_text:    [172 ↛ exit: line 172 didn't return from function 'process_linkage_template' because the condition on line 172 was always true]
173             furigana_start = expanded_text.rindex("(")
174             linkage = Linkage(
175                 word=expanded_text[:furigana_start],
176                 furigana=expanded_text[furigana_start:].strip("() "),
177                 sense_index=sense_index,
178             )
179             linkages[sense_index].append(linkage)
180     else:
181         raw_tag = clean_node(wxr, None, template_node)
182         if raw_tag.endswith(".") or raw_tag in TAGS:    [182 ↛ 184: line 182 didn't jump to line 184 because the condition on line 182 was always true]
183             raw_tags.append(raw_tag)
184         elif is_translation:
185             tr_nodes.append(raw_tag)
186         else:
187             word_nodes.append(raw_tag)
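For the {{furi}} template the expanded text is split at its last opening parenthesis: the part before it becomes the word and the parenthesised tail the furigana reading. A small sketch of that slicing with an assumed expansion string:

# The expansion string is an assumption; only the slicing mirrors
# lines 173-176 of the module.
expanded_text = "辞書(じしょ)"
furigana_start = expanded_text.rindex("(")
word = expanded_text[:furigana_start]
furigana = expanded_text[furigana_start:].strip("() ")
assert (word, furigana) == ("辞書", "じしょ")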