Coverage for src/wiktextract/datautils.py: 95%

1# Utilities for manipulating word data structures

4import re

5from collections import defaultdict

6from typing import Any, Iterable, Optional

# Keys in ``data`` whose list may only hold string values
STR_KEYS = frozenset(("tags", "glosses"))

# Keys in ``data`` whose list may only hold dict values
DICT_KEYS = frozenset(
    (
        "pronunciations",
        "senses",
        "synonyms",
        "related",
        "antonyms",
        "hypernyms",
        "holonyms",
        "forms",
    )
)

def data_append(data: Any, key: str, value: Any) -> None:
    """Append ``value`` to the list stored under ``key`` in ``data``.

    ``data`` is either a dictionary (the list lives at ``data[key]`` and is
    created if missing) or an object exposing the list as an attribute named
    ``key``.

    Keys listed in ``STR_KEYS`` only accept ``str`` values and keys listed in
    ``DICT_KEYS`` only accept ``dict`` values; this is enforced with
    assertions.
    """
    assert isinstance(key, str)

    if key in STR_KEYS:
        assert isinstance(value, str)
    elif key in DICT_KEYS:
        assert isinstance(value, dict)

    if isinstance(data, dict):
        # Dictionaries are handled before any attribute lookup: a key such
        # as "items", "copy" or "get" would otherwise resolve to the dict
        # method of the same name and fail on ``.append``.
        # NOTE(review): a dict subclass that also carries a list attribute
        # named ``key`` is now updated through item access instead of the
        # attribute - confirm no caller relies on the attribute path.
        data.setdefault(key, []).append(value)
        return

    if hasattr(data, key):
        # Attribute-style container: append in place and store the list
        # back through setattr, as before.
        list_value = getattr(data, key)
        list_value.append(value)
        setattr(data, key, list_value)
    else:
        # Non-dict object without such an attribute: keep the historical
        # behaviour of appending to whatever ``data.get`` yields, without
        # storing a newly created list back.
        data.get(key, []).append(value)

def data_extend(data: Any, key: str, values: Iterable) -> None:
    """Append every element of ``values`` under ``key`` in the dictionary
    ``data``, one ``data_append`` call per element.

    ``data`` must be a dict and ``values`` a list or tuple (both asserted).
    """
    assert isinstance(data, dict)
    assert isinstance(key, str)
    assert isinstance(values, (list, tuple))

    # Iterate over a snapshot rather than ``values`` itself: ``values`` may
    # be the very list stored at data[key], and appending to a list while
    # iterating over it never terminates (it eventually exhausts memory).
    snapshot = tuple(values)
    for item in snapshot:
        data_append(data, key, item)

def split_at_comma_semi(
    text: str,
    separators: Iterable[str] = (",", ";", "，", "،"),
    extra: Iterable[str] = (),
    skipped: Optional[Iterable[str]] = None,
) -> list[str]:
    """Split ``text`` at commas and semicolons, unless they are inside
    parentheses or square brackets.

    ``separators`` is the default set of separators (setting it replaces the
    defaults).  ``extra`` holds separators used in addition to
    ``separators``.  Entries of ``separators`` and ``extra`` must be valid
    regexp pieces (already escaped if needed), and both arguments must be
    lists or tuples (asserted).

    ``skipped`` may be any iterable of literal strings that would otherwise
    be split but should be kept whole; for example phrases like
    'Hunde, die bellen, beißen nicht', which would otherwise be split on the
    commas.  This is often link text, because links are prototypically one
    unit.

    Returns the list of stripped pieces.  If the whole text consists of a
    single separator/bracket/skipped match, ``[text]`` is returned
    unsplit.
    """
    assert isinstance(text, str)
    assert isinstance(separators, (list, tuple))
    assert isinstance(extra, (list, tuple))

    # ``skipped`` is needed twice (to build the regexp and to recognise its
    # matches), so materialise it once; a one-shot iterator would otherwise
    # be exhausted by the time matches are classified.
    skipped_items = tuple(skipped) if skipped else ()
    skipped_set = frozenset(skipped_items)

    if extra:
        separators = tuple(separators) + tuple(extra)

    # Alternation order matters: skipped phrases first, then brackets, then
    # separators with the longest ones tried first.
    splitters: list[str] = [re.escape(s) for s in skipped_items]
    splitters.append(r"[][()]")
    splitters.extend(sorted(separators, key=lambda x: -len(x)))
    split_re = "|".join(splitters)

    lst = []
    parts = []  # fragments of the piece currently being assembled
    bracket_cnt = 0  # nesting depth over both () and []
    ofs = 0
    for m in re.finditer(split_re, text):
        if ofs < m.start():
            parts.append(text[ofs : m.start()])
        if m.start() == 0 and m.end() == len(text):
            return [text]  # Don't split if it is the only content
        ofs = m.end()
        token = m.group(0)
        if token in skipped_set:
            # Protected phrase: keep verbatim inside the current piece
            parts.append(token)
        elif token in ("(", "["):
            bracket_cnt += 1
            parts.append(token)
        elif token in (")", "]"):
            bracket_cnt -= 1
            parts.append(token)
        elif bracket_cnt > 0:
            # Separator nested inside brackets is ordinary text
            parts.append(token)
        elif parts:
            # Top-level separator: close the current piece
            lst.append("".join(parts).strip())
            parts = []
    if ofs < len(text):
        parts.append(text[ofs:])
    if parts:
        lst.append("".join(parts).strip())
    return lst

118

119

def split_slashes(wxr, text):
    """Split ``text`` at slashes and return the list of alternatives.

    If ``wxr.wtp.page_exists(text)`` is true for the stripped text, it is
    returned as the single alternative.  Otherwise fullwidth slashes are
    normalised to "/", the text is always split at " / ", and each
    resulting alternative is handled as follows:

    * no slash, or a leading/trailing slash: kept as is;
    * a single word: split at every slash;
    * several words: every slash-separated word is expanded into
      alternatives, and for each distinct word sequence the grouping with
      the lowest score (fewest, longest phrases) is emitted.

    The multi-word heuristic is intended to prefer longer forms found in the
    dictionary, but that lookup is currently disabled (see the XXX note).
    """
    text = text.strip()
    if wxr.wtp.page_exists(text):
        return [text]

    text = text.replace("／", "/")
    alts = text.split(" / ")  # Always full split at " / "
    ret = []
    for alt in alts:
        alt = alt.strip()
        if "/" not in alt or alt[0] == "/" or alt[-1] == "/":
            # No slashes, no splitting; or starts/ends with a slash
            ret.append(alt)
            continue

        # Split text into words.  If only one word, assume single-word splits
        words = alt.split()
        if len(words) == 1:
            ret.extend(x.strip() for x in alt.split("/"))
            continue

        # More than one word.  Each candidate is a pair (ws, divs): ``ws``
        # is the phrase currently being built (tuple of words) and ``divs``
        # the tuple of phrases already completed.
        cands = [((), ())]
        for word in words:
            new_cands = []
            parts = word.split("/")
            if len(parts) == 1:
                for ws, divs in cands:
                    new_cands.append((ws + tuple(parts), divs))
            else:
                # Otherwise we might either just add alternatives for this
                # word or add alternatives for the whole phrase
                for p in parts:
                    for ws, divs in cands:
                        ws = ws + (p,)
                        new_cands.append(((), divs + (ws,)))
                        new_cands.append((ws, divs))
            cands = new_cands

        # Finalize candidates: close any phrase still being built
        final_cands = set()
        for ws, divs in cands:
            final_cands.add(divs + (ws,) if ws else divs)

        # XXX this does not work yet: the score was meant to penalise
        # phrases for which wxr.wtp.page_exists(" ".join(ws)) is false
        # (by 1000 * len(ws)), but that check is disabled.
        ht = defaultdict(list)
        for divs in final_cands:
            assert isinstance(divs, tuple) and isinstance(divs[0], tuple)
            score = 0
            flat_words = []
            for ws in divs:
                assert isinstance(ws, tuple)
                flat_words.extend(ws)
                score += 100
                score += 1 / len(ws)
            ht[tuple(flat_words)].append((score, divs))

        # Groupings that flatten to the same word sequence compete with
        # each other; emit the lowest-scoring one for each sequence.
        for key, items in sorted(ht.items()):
            score, divs = min(items)
            for ws in divs:
                ret.append(" ".join(ws))

    return ret

196

197

def freeze(x):
    """Return a read-only version of ``x`` suitable as a set member or
    dictionary key.

    Dicts become a frozenset of ``(key, frozen value)`` pairs, sets become
    frozensets, and lists/tuples become tuples of frozen elements.  Anything
    else is returned unchanged.
    """
    if isinstance(x, dict):
        # XXX pending removal - all entries from inflection tables are now
        # added, so the "source" field is no longer stripped here.
        frozen_pairs = ((name, freeze(val)) for name, val in x.items())
        return frozenset(frozen_pairs)
    elif isinstance(x, set):
        return frozenset(x)
    elif isinstance(x, (list, tuple)):
        return tuple(map(freeze, x))
    else:
        # XXX other objects are not currently handled
        return x

213

214

def ns_title_prefix_tuple(
    wxr, namespace: str, lower: bool = False
) -> tuple[str, ...]:
    """Build the tuple of title prefixes ("Name:") for ``namespace``.

    The prefixes are the namespace's ``"name"`` followed by its
    ``"aliases"``, as found in ``wxr.wtp.NAMESPACE_DATA``, each with a
    trailing colon.  With ``lower`` set, every prefix is lowercased.
    Returns an empty tuple when the namespace is not in ``NAMESPACE_DATA``.
    """
    if namespace not in wxr.wtp.NAMESPACE_DATA:
        return ()

    ns_entry = wxr.wtp.NAMESPACE_DATA[namespace]
    titles = [ns_entry["name"]] + ns_entry["aliases"]
    if lower:
        return tuple(title.lower() + ":" for title in titles)
    return tuple(title + ":" for title in titles)