Coverage for src/wiktextract/datautils.py: 95%
133 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
# Utilities for manipulating word data structures
#
# Copyright (c) 2018-2022 Tatu Ylonen.  See file LICENSE and https://ylonen.org
import re
from collections import defaultdict
from typing import Any, Iterable, Optional
# Keys in ``data`` whose list entries must all be strings.
STR_KEYS = frozenset({"tags", "glosses"})
# Keys in ``data`` whose list entries must all be dicts.
DICT_KEYS = frozenset(
    {
        "pronunciations",
        "senses",
        "synonyms",
        "related",
        "antonyms",
        "hypernyms",
        "holonyms",
        "forms",
    }
)
def data_append(data: Any, key: str, value: Any) -> None:
    """Append ``value`` to the list stored under ``key`` in ``data``.

    ``data`` is normally a dict, in which case the list is created when the
    key does not exist yet.  Objects that expose ``key`` as an attribute are
    also supported: the attribute's list is appended to and then assigned
    back with ``setattr``.

    ``value`` must be a ``str`` when ``key`` is in ``STR_KEYS`` and a
    ``dict`` when ``key`` is in ``DICT_KEYS``; this is checked with
    assertions.
    """
    assert isinstance(key, str)

    if key in STR_KEYS:
        assert isinstance(value, str)
    elif key in DICT_KEYS:
        assert isinstance(value, dict)

    if isinstance(data, dict):
        # Dicts must be handled before any hasattr() probing: a key such as
        # "items", "keys" or "copy" names a dict method, so hasattr() would
        # be true and we would try to append to a bound method.
        data.setdefault(key, []).append(value)
        return

    # Non-dict containers: prefer the attribute when there is one, otherwise
    # fall back to a mapping-style ``get``.  As before, nothing is stored
    # back in the fallback case.
    has_attr = hasattr(data, key)
    list_value = getattr(data, key) if has_attr else data.get(key, [])
    list_value.append(value)
    if has_attr:
        setattr(data, key, list_value)
def data_extend(data: Any, key: str, values: Iterable) -> None:
    """Append every item of ``values`` under ``key`` in the dict ``data``.

    ``data`` must be a dict and ``values`` a list or tuple (both checked
    with assertions).  Each item is added through ``data_append``, so the
    same per-key value checks apply.
    """
    assert isinstance(data, dict)
    assert isinstance(key, str)
    assert isinstance(values, (list, tuple))

    # Iterate over a snapshot: ``values`` may be the very list stored at
    # data[key].  Appending to a list while iterating it never terminates
    # and eventually exhausts memory; this has happened in practice.
    for x in tuple(values):
        data_append(data, key, x)
def split_at_comma_semi(
    text: str,
    # "\uff0c" is the fullwidth comma; written as an escape so that it
    # survives tools that normalize it to an ASCII comma.
    separators: Iterable[str] = (",", ";", "\uff0c", "،"),
    extra: Iterable[str] = (),
    skipped: Optional[Iterable[str]] = None,
) -> list[str]:
    """Split ``text`` at commas and semicolons that are not inside
    parentheses or square brackets.

    ``separators`` are the default separators (passing it replaces the
    defaults).  ``extra`` are additional separators used together with
    ``separators``.  Entries of both must be valid regexp pieces (already
    escaped if needed) and both must be lists or tuples.

    ``skipped`` may contain strings that would otherwise be split but must
    be kept in one piece, for example phrases like
    'Hunde, die bellen, beißen nicht'.  This is often link text, because
    links are prototypically one unit.

    Each returned piece is stripped of surrounding whitespace.  If a single
    match covers the whole text, ``[text]`` is returned unchanged.
    """
    assert isinstance(text, str)
    assert isinstance(separators, (list, tuple))
    assert isinstance(extra, (list, tuple))

    # Materialize once: ``skipped`` is used both to build the regexp and for
    # membership tests, which would fail for a one-shot iterable.
    skipped = tuple(skipped) if skipped else ()
    if extra:
        separators = tuple(separators) + tuple(extra)

    # Alternation order matters: skipped phrases win over brackets, which
    # win over separators; longer separators are tried before shorter ones.
    splitters: list[str] = [re.escape(s) for s in skipped]
    splitters.append(r"[][()]")
    splitters.extend(sorted(separators, key=lambda x: -len(x)))
    split_re = "|".join(splitters)

    lst: list[str] = []
    parts: list[str] = []
    bracket_cnt = 0
    ofs = 0
    for m in re.finditer(split_re, text):
        if ofs < m.start():
            parts.append(text[ofs : m.start()])
        if m.start() == 0 and m.end() == len(text):
            return [text]  # Don't split if it is the only content
        ofs = m.end()
        token = m.group(0)
        if token in skipped:
            parts.append(token)
            continue
        if token in ("(", "["):
            bracket_cnt += 1
            parts.append(token)
        elif token in (")", "]"):
            # Clamp at zero so that a stray closing bracket (as in "1) foo")
            # does not make later, properly balanced brackets look closed.
            bracket_cnt = max(bracket_cnt - 1, 0)
            parts.append(token)
        elif bracket_cnt > 0:
            # Separator inside brackets: keep it as ordinary text.
            parts.append(token)
        else:
            if parts:
                lst.append("".join(parts).strip())
            parts = []
    if ofs < len(text):
        parts.append(text[ofs:])
    if parts:
        lst.append("".join(parts).strip())
    return lst
def split_slashes(wxr, text):
    """Split ``text`` at slashes and return the list of alternatives.

    If ``wxr.wtp.page_exists`` reports a page for the stripped text, the
    text is returned as the only element.  Otherwise the text is always
    split at " / ", and each resulting alternative is handled as follows:

    - no slash, or a leading/trailing slash: kept as is;
    - a single word: split at every slash;
    - several words: every slash-separated variant of a word is combined
      with the other words, and for each resulting word sequence the
      grouping with the lowest score (fewest, longest groups) is emitted.

    The multi-word heuristic is meant to prefer longer forms that can be
    found in the dictionary, but the dictionary lookup is not enabled yet
    (XXX this does not work yet).
    """
    text = text.strip()
    if wxr.wtp.page_exists(text):
        return [text]

    # Treat the fullwidth solidus like an ASCII slash.  Written as an escape
    # so that it survives tools that normalize it to "/".
    text = text.replace("\uff0f", "/")
    alts = text.split(" / ")  # Always full split at " / "
    ret = []
    for alt in alts:
        alt = alt.strip()
        if "/" not in alt or alt[0] == "/" or alt[-1] == "/":
            # No slashes, no splitting; or starts/ends with a slash
            ret.append(alt)
            continue

        # Split text into words.  If only one word, assume single-word splits
        words = alt.split()
        if len(words) == 1:
            ret.extend(x.strip() for x in alt.split("/"))
            continue

        # More than one word.  A candidate is (ws, divs): ``ws`` is the
        # group of words currently being built and ``divs`` the tuple of
        # groups already closed.
        cands = [((), ())]
        for word in words:
            new_cands = []
            parts = word.split("/")
            if len(parts) == 1:
                for ws, divs in cands:
                    new_cands.append((ws + tuple(parts), divs))
            else:
                # Otherwise we might either just add alternatives for this
                # word or add alternatives for the whole phrase
                for p in parts:
                    for ws, divs in cands:
                        ws = ws + (p,)
                        new_cands.append(((), divs + (ws,)))
                        new_cands.append((ws, divs))
            cands = new_cands

        # Finalize candidates: close the group that is still open, if any.
        final_cands = set()
        for ws, divs in cands:
            final_cands.add(divs + (ws,) if ws else divs)

        # Group candidates by their flattened word sequence and keep the
        # lowest-scoring grouping for each sequence.
        ht = defaultdict(list)
        for divs in final_cands:
            assert isinstance(divs, tuple) and isinstance(divs[0], tuple)
            score = 0
            flat = []
            for ws in divs:
                assert isinstance(ws, tuple)
                # exists = wxr.wtp.page_exists(" ".join(ws))
                flat.extend(ws)
                score += 100
                score += 1 / len(ws)
                # if not exists:
                #    score += 1000 * len(ws)
            ht[tuple(flat)].append((score, divs))
        for _key, items in sorted(ht.items()):
            _score, divs = min(items)
            for ws in divs:
                ret.append(" ".join(ws))

    return ret
def freeze(x):
    """Return a hashable, read-only version of ``x`` for use as a set
    member or dictionary key.

    Dicts become a frozenset of ``(key, frozen value)`` pairs, sets become
    frozensets, and lists and tuples become tuples of frozen items.  Any
    other value is returned unchanged.

    No dictionary field is dropped: the "source" field used to be ignored,
    but it is now kept because all entries from inflection tables are
    added.
    """
    if isinstance(x, dict):
        return frozenset((k, freeze(v)) for k, v in x.items())
    if isinstance(x, set):
        return frozenset(x)
    if isinstance(x, (list, tuple)):
        return tuple(freeze(v) for v in x)
    # XXX objects not currently handled
    return x
def ns_title_prefix_tuple(
    wxr, namespace: str, lower: bool = False
) -> tuple[str, ...]:
    """Return the title prefixes ("Name:") for ``namespace``.

    The result holds the namespace's ``"name"`` followed by its
    ``"aliases"`` as found in ``wxr.wtp.NAMESPACE_DATA``, each with a
    trailing colon.  With ``lower`` set, every prefix is lowercased.  An
    unknown namespace yields an empty tuple.
    """
    if namespace not in wxr.wtp.NAMESPACE_DATA:
        return ()
    ns_info = wxr.wtp.NAMESPACE_DATA[namespace]
    names = [ns_info["name"]] + ns_info["aliases"]
    if lower:
        return tuple(name.lower() + ":" for name in names)
    return tuple(name + ":" for name in names)