Coverage for src/wiktextract/datautils.py: 95%

133 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1# Utilities for manipulating word data structures 

2# 

3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4import re 

5from collections import defaultdict 

6from typing import Any, Iterable, Optional 

7 

# Keys of ``data`` whose list may only hold string values
STR_KEYS = frozenset(("tags", "glosses"))

# Keys of ``data`` whose list may only hold dict values
DICT_KEYS = frozenset(
    (
        "pronunciations",
        "senses",
        "synonyms",
        "related",
        "antonyms",
        "hypernyms",
        "holonyms",
        "forms",
    )
)

23 

24 

def data_append(data: Any, key: str, value: Any) -> None:
    """Append ``value`` to the list stored under ``key`` in ``data``.

    ``data`` is normally a dictionary: the list lives at ``data[key]`` and
    is created if it does not exist yet.  A non-dict object that already has
    an attribute named ``key`` gets the value appended to that attribute
    instead.

    Values for keys listed in ``STR_KEYS`` must be strings and values for
    keys listed in ``DICT_KEYS`` must be dicts; both are enforced with
    assertions, as is ``key`` being a string.
    """
    assert isinstance(key, str)

    if key in STR_KEYS:
        assert isinstance(value, str)
    elif key in DICT_KEYS:
        assert isinstance(value, dict)

    # Dictionaries are handled first.  ``hasattr()`` is also true for the
    # names of dict methods, so a key such as "items", "values" or "copy"
    # must not be mistaken for an attribute: that would try to append to a
    # bound method instead of to the list stored in the dictionary.
    if isinstance(data, dict):
        data.setdefault(key, []).append(value)
        return

    if hasattr(data, key):
        list_value = getattr(data, key)
        list_value.append(value)
        # Write the list back to the attribute after mutating it.
        setattr(data, key, list_value)
        return

    # Neither a dict nor an object with this attribute: fall back to a
    # mapping-style ``get``.  An existing list is mutated in place.
    # NOTE(review): when ``key`` is absent the new list is not stored
    # anywhere, so the value is dropped - confirm whether any caller
    # relies on this path.
    data.get(key, []).append(value)

42 

43 

def data_extend(data: Any, key: str, values: Iterable) -> None:
    """
    Append each element of ``values`` to the list under ``key`` in the
    dictionary ``data``, one ``data_append`` call per element.
    """
    assert isinstance(data, dict)
    assert isinstance(key, str)
    assert isinstance(values, (list, tuple))

    # Take a snapshot before appending: ``values`` may be the very list
    # stored at data[key].  Iterating that list while appending to it never
    # terminates and eventually exhausts memory (this has happened).
    snapshot = tuple(values)
    for item in snapshot:
        data_append(data, key, item)

58 

59 

def split_at_comma_semi(
    text: str,
    # "\uff0c" is the fullwidth comma; written as an escape so that it
    # cannot be folded into a plain "," by text normalization.
    separators: Iterable[str] = (",", ";", "\uff0c", "،"),
    extra: Iterable[str] = (),
    skipped: Optional[Iterable[str]] = None,
) -> list[str]:
    """Splits the text at commas and semicolons, unless they are inside
    parentheses or square brackets.

    ``separators`` is the default separators (setting it eliminates the
    default separators).  ``extra`` is extra separators to be used in
    addition to ``separators``.  The separators in ``separators`` and
    ``extra`` must be valid regexp pieces (already escaped if needed) and
    both arguments must be lists or tuples.

    ``skipped`` can be an iterable of strings containing material that
    would otherwise be split, but should not be; for example phrases like
    'Hunde, die bellen, beißen nicht', which would otherwise be split on the
    commas.  Often this is link text data, because links are prototypically
    one unit.  These strings are matched literally.

    Returns the list of pieces with surrounding whitespace stripped.  A text
    that consists of nothing but a single separator (or bracket, or skipped
    phrase) is returned unsplit as ``[text]``; an empty text gives ``[]``.
    """
    assert isinstance(text, str)
    assert isinstance(separators, (list, tuple))
    assert isinstance(extra, (list, tuple))

    # ``skipped`` is needed twice (to build the regexp and for membership
    # tests below), so materialize it once; a one-shot iterable would
    # otherwise be exhausted by the time the membership test runs.
    skipped_items = tuple(skipped) if skipped else ()
    if extra:
        separators = tuple(separators) + tuple(extra)

    # Alternation order matters: protected phrases first, then brackets,
    # then separators with the longest ones first.
    splitters: list[str] = [re.escape(s) for s in skipped_items]
    splitters.append(r"[][()]")
    splitters.extend(sorted(separators, key=len, reverse=True))
    split_re = "|".join(splitters)

    lst: list[str] = []  # finished pieces
    parts: list[str] = []  # fragments of the piece being assembled
    depth = 0  # nesting depth of () and [] combined
    ofs = 0  # end of the previous match

    for m in re.finditer(split_re, text):
        if ofs < m.start():
            parts.append(text[ofs : m.start()])
        if m.start() == 0 and m.end() == len(text):
            return [text]  # Don't split if it is the only content
        ofs = m.end()
        token = m.group(0)
        if not token:
            # A zero-width match carries no separator; ignore it.
            continue
        if token in skipped_items:
            parts.append(token)
        elif token in ("(", "["):
            depth += 1
            parts.append(token)
        elif token in (")", "]"):
            # Never go below zero: an unmatched closing bracket must not
            # cancel out a later opening one, or separators inside that
            # later group would be split on.
            depth = max(depth - 1, 0)
            parts.append(token)
        elif depth > 0:
            # Separator inside brackets is ordinary text.
            parts.append(token)
        elif parts:
            lst.append("".join(parts).strip())
            parts = []

    if ofs < len(text):
        parts.append(text[ofs:])
    if parts:
        lst.append("".join(parts).strip())
    return lst

118 

119 

def split_slashes(wxr, text):
    """Splits the text at slashes.  This tries to use heuristics on how the
    split is to be interpreted, trying to prefer longer forms that can be
    found in the dictionary.

    ``wxr`` must provide ``wxr.wtp.page_exists(title)``; if the whole
    (stripped) text exists as a page it is returned unsplit.  Otherwise the
    text is always split at " / ", and each resulting alternative that
    contains further slashes is expanded.  Returns a list of strings.
    """
    text = text.strip()
    if wxr.wtp.page_exists(text):
        return [text]

    # Treat the fullwidth solidus like an ordinary slash.  Written as an
    # escape so that it cannot be folded into "/" by text normalization.
    text = text.replace("\uff0f", "/")
    alts = text.split(" / ")  # Always full split at " / "
    ret = []
    for alt in alts:
        alt = alt.strip()
        if "/" not in alt or alt[0] == "/" or alt[-1] == "/":
            # No slashes, no splitting; or starts/ends with a slash
            ret.append(alt)
            continue

        # Split text into words.  If only one word, assume single-word splits
        words = alt.split()
        if len(words) == 1:
            ret.extend(x.strip() for x in alt.split("/"))
            continue

        # More than one word.  Each candidate is a pair (ws, divs): ``ws``
        # is the tuple of words of the phrase currently being built and
        # ``divs`` is the tuple of phrases already completed.
        cands = [((), ())]
        for word in words:
            new_cands = []
            parts = word.split("/")
            if len(parts) == 1:
                for ws, divs in cands:
                    new_cands.append((ws + tuple(parts), divs))
            else:
                # Otherwise we might either just add alternatives for this
                # word or add alternatives for the whole phrase
                for p in parts:
                    for ws, divs in cands:
                        ws = ws + (p,)
                        new_cands.append(((), divs + (ws,)))
                        new_cands.append((ws, divs))
            cands = new_cands

        # Finalize candidates: close the phrase still being built, if any
        final_cands = set()
        for ws, divs in cands:
            final_cands.add(divs + (ws,) if ws else divs)

        # XXX this does not work yet
        # TODO: the score does not yet take into account whether a
        # candidate phrase exists as a page (wxr.wtp.page_exists); it only
        # favours fewer, longer phrases.
        ht = defaultdict(list)
        for divs in final_cands:
            assert isinstance(divs, tuple) and isinstance(divs[0], tuple)
            score = 0
            flat_words = []
            for ws in divs:
                assert isinstance(ws, tuple)
                flat_words.extend(ws)
                score += 100
                score += 1 / len(ws)
            ht[tuple(flat_words)].append((score, divs))

        # For each distinct word sequence keep the lowest-scoring division
        for _key, items in sorted(ht.items()):
            _score, divs = min(items)
            for ws in divs:
                ret.append(" ".join(ws))

    return ret

196 

197 

def freeze(x):
    """Produces a read-only key for sets/dictionaries from the data.

    Dictionaries become frozensets of ``(key, frozen value)`` pairs, sets
    become frozensets, and lists and tuples become tuples of frozen
    elements.  Anything else is returned unchanged.  No dictionary keys are
    dropped (a "source" field, if present, is part of the result).
    """
    if isinstance(x, dict):
        pairs = [(key, freeze(value)) for key, value in x.items()]
        return frozenset(pairs)
    if isinstance(x, set):
        return frozenset(x)
    if not isinstance(x, (list, tuple)):
        # XXX other objects are not currently handled; passed through as is
        return x
    return tuple(map(freeze, x))

213 

214 

def ns_title_prefix_tuple(
    wxr, namespace: str, lower: bool = False
) -> tuple[str, ...]:
    """Build the tuple of title prefixes ("Name:") for a namespace.

    The namespace is looked up in ``wxr.wtp.NAMESPACE_DATA``; its "name"
    comes first, followed by its "aliases", each with a trailing colon.
    With ``lower`` set, every prefix is lowercased.  An unknown namespace
    gives an empty tuple.
    """
    if namespace not in wxr.wtp.NAMESPACE_DATA:
        return ()

    ns_entry = wxr.wtp.NAMESPACE_DATA[namespace]
    titles = [ns_entry["name"]] + ns_entry["aliases"]
    if lower:
        return tuple(title.lower() + ":" for title in titles)
    return tuple(title + ":" for title in titles)