Coverage for src/wiktextract/extractor/pl/linkage.py: 91%

104 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2from collections import defaultdict 

3 

4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Linkage, WordEntry 

9from .tags import TAGS, translate_raw_tags 

10 

# Polish linkage names mapped to English linkage names.
# NOTE(review): the values look like the WordEntry list attributes that
# extract_linkage_section() receives as ``linkage_type`` — confirm at the
# call site, which is not in this file.
LINKAGE_TYPES = {
    "antonimy": "antonyms",
    "hiperonimy": "hypernyms",
    "hiponimy": "hyponyms",
    "holonimy": "holonyms",
    "kolokacje": "related",
    "meronimy": "meronyms",
    "synonimy": "synonyms",
    "wyrazy pochodne": "derived",
    "wyrazy pokrewne": "related",
    "związki frazeologiczne": "proverbs",
    "złożenia": "derived",
}

24 

25 

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    linkage_type: str,
    lang_code: str,
) -> None:
    """Collect the linkages below ``level_node`` and attach them to entries.

    Every list item found (recursively) under the section is parsed with
    ``process_linkage_list_item``.  When the section has no list item at
    all, its links and templates are read directly instead.

    The collected linkages are grouped by sense-index string.  For each
    entry in ``page_data`` whose ``lang_code`` equals ``lang_code``:

    * groups whose index ``match_sense_index`` accepts for that entry are
      appended to the entry's ``linkage_type`` list attribute;
    * the group without a sense index (key ``""``) is always appended.

    Groups that no entry accepted are finally appended to the first entry
    of that language, so no collected linkage is dropped.
    """
    from .page import match_sense_index

    linkages = defaultdict(list)
    found_list_item = False
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        found_list_item = True
        process_linkage_list_item(wxr, item, linkages)

    if not found_list_item:
        # get around "preformatted" node
        wanted_kinds = NodeKind.LINK | NodeKind.TEMPLATE
        for child in level_node.find_child_recursively(wanted_kinds):
            if child.kind == NodeKind.LINK:
                text = clean_node(wxr, None, child)
                if text != "":
                    linkages[""].append(Linkage(word=text))
                continue
            if isinstance(child, TemplateNode):
                # Throw-away lists: words or tags the template would add
                # to them are not used on this code path.
                process_linkage_template(
                    wxr, child, linkages, "", False, [], [], []
                )

    same_lang_entries = [
        entry for entry in page_data if entry.lang_code == lang_code
    ]

    matched_indexes = set()
    for entry in same_lang_entries:
        for index, group in linkages.items():
            if match_sense_index(index, entry):
                getattr(entry, linkage_type).extend(group)
                matched_indexes.add(index)
        getattr(entry, linkage_type).extend(linkages.get("", []))

    # add not matched data
    linkages.pop("", None)
    first_entry = next(iter(same_lang_entries), None)
    if first_entry is not None:
        for index, group in linkages.items():
            if index not in matched_indexes:
                getattr(first_entry, linkage_type).extend(group)

73 

74 

def process_linkage_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    linkages: dict[str, list[Linkage]],
) -> None:
    """Parse one list item into ``Linkage`` objects, grouped by sense index.

    The children of ``list_item`` are scanned left to right:

    * a string child may carry a parenthesised sense index such as
      ``(1.1)``; the text inside the parentheses becomes the current
      sense index and everything up to the closing parenthesis is dropped;
    * ``→`` switches collection from the word to its translation;
    * the first of ``;``, ``•``, ``,``, ``/`` present in a string child
      ends the current linkage and starts a new one;
    * templates are delegated to ``process_linkage_template``;
    * nested lists are skipped;
    * any other node is added to the word or translation being collected.

    Results are appended to ``linkages[sense_index]``; nothing is returned.
    """
    raw_tags: list[str] = []
    sense_index = ""
    word_nodes = []
    translation_nodes = []
    is_translation = False
    separators = (";", "•", ",", "/")

    for node in list_item.children:
        if isinstance(node, str):
            # Hyphen placed last so it is an explicit literal; the class
            # accepts digits, whitespace, ",", "." and "-".
            m = re.search(r"\([\d\s,.-]+\)", node)
            if m is not None:
                sense_index = m.group(0).strip("()")
                node = node[m.end() :]

            if "→" in node:
                is_translation = True
                tr_start = node.index("→")
                word_nodes.append(node[:tr_start])
                translation_nodes.append(node[tr_start + 1 :])
                continue

            # Separators are tried in priority order and only the first
            # occurrence of the first one present is used as a split point.
            sep = next((s for s in separators if s in node), None)
            if sep is None:
                if is_translation:
                    translation_nodes.append(node)
                else:
                    word_nodes.append(node)
                continue

            sep_index = node.index(sep)
            if is_translation:
                translation_nodes.append(node[:sep_index])
            else:
                word_nodes.append(node[:sep_index])
            _append_linkage(
                wxr,
                linkages,
                word_nodes,
                translation_nodes,
                raw_tags,
                sense_index,
            )

            # Reset the per-linkage state; the text after the separator
            # starts the next word.  The sense index is kept on purpose.
            word_nodes.clear()
            translation_nodes.clear()
            is_translation = False
            raw_tags.clear()
            word_nodes.append(node[sep_index + 1 :])
        elif isinstance(node, TemplateNode):
            process_linkage_template(
                wxr,
                node,
                linkages,
                sense_index,
                is_translation,
                word_nodes,
                translation_nodes,
                raw_tags,
            )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Checked before the translation branch: otherwise a nested
            # list following "→" would have its whole text merged into
            # the translation of this item.
            # NOTE(review): assumes the caller visits nested items itself
            # (extract_linkage_section searches LIST_ITEM recursively).
            continue
        elif is_translation:
            translation_nodes.append(node)
        else:
            word_nodes.append(node)

    if len(word_nodes) > 0:
        _append_linkage(
            wxr,
            linkages,
            word_nodes,
            translation_nodes,
            raw_tags,
            sense_index,
        )


def _append_linkage(
    wxr: WiktextractContext,
    linkages: dict[str, list[Linkage]],
    word_nodes: list,
    translation_nodes: list,
    raw_tags: list[str],
    sense_index: str,
) -> None:
    """Build a ``Linkage`` from the collected nodes and store it.

    The linkage is appended to ``linkages[sense_index]``.  Nothing is
    stored when the cleaned word text is empty.
    """
    word = clean_node(wxr, None, word_nodes)
    if len(word) == 0:
        return
    linkage = Linkage(
        word=word,
        translation=clean_node(wxr, None, translation_nodes),
        raw_tags=raw_tags,
        sense_index=sense_index,
    )
    translate_raw_tags(linkage)
    linkages[sense_index].append(linkage)

159 

160 

def process_linkage_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    linkages: dict[str, list[Linkage]],
    sense_index: str,
    is_translation: bool,
    word_nodes: list[WikiNode],
    tr_nodes: list[WikiNode],
    raw_tags: list[str],
) -> None:
    """Handle a template found inside a linkage list item or section.

    ``furi`` templates: the expanded text is split at its last ``(``; the
    part before it becomes the word and the rest (with parentheses and
    spaces stripped) the furigana.  The resulting ``Linkage`` is appended
    to ``linkages[sense_index]``.  Expanded text without ``(`` is ignored.

    Any other template: the expanded text is appended to ``raw_tags`` when
    it ends with ``.`` or is a key of ``TAGS``; otherwise it is appended to
    ``tr_nodes`` (when ``is_translation``) or to ``word_nodes``.
    """
    text = clean_node(wxr, None, template_node)

    if template_node.template_name == "furi":
        base, paren, reading = text.rpartition("(")
        if paren:
            linkages[sense_index].append(
                Linkage(
                    word=base,
                    furigana=reading.strip("() "),
                    sense_index=sense_index,
                )
            )
        return

    if text.endswith(".") or text in TAGS:
        destination = raw_tags
    elif is_translation:
        destination = tr_nodes
    else:
        destination = word_nodes
    destination.append(text)