Coverage for src/wiktextract/extractor/en/hieroglyphs.py: 90%
56 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2import unicodedata
# Phoneme (transliteration) aliases accepted in hieroglyph markup, mapped to
# the sign code they stand for.  Values use the same spelling as the keys that
# are generated for hiero_map from Unicode character names: an upper-case
# letter prefix followed by the number without zero padding (e.g. "AA15").
# Several aliases may share one sign.  Entries are grouped by sign-code prefix.
#
# Commented-out entries were already disabled in the original table
# (presumably because their target code has no character in hiero_map --
# TODO confirm); they are kept for reference.
hiero_phoneme_map: dict[str, str] = {
    "mSa": "A12",
    "xr": "A15",
    "Xrd": "A17",
    "sr": "A21",
    "mniw": "A33",
    "qiz": "A38",
    "iry": "A47",
    "Sps": "A50",
    "Spsi": "A51",

    "msi": "B3",

    "mAat": "C10",
    "HH": "C11",
    "DHwty": "C3",
    "Xnmw": "C4",
    "inpw": "C6",
    "stX": "C7",
    "mnw": "C8",

    "tp": "D1",
    "WDAt": "D10",
    # "R": "D153",
    "fnD": "D19",
    "Hr": "D2",
    "r": "D21",
    "rA": "D21",
    "spt": "D24",
    "spty": "D25",
    "mnD": "D27",
    "kA": "D28",
    "Sny": "D3",
    "aHA": "D34",
    "a": "D36",
    "ir": "D4",
    "Dsr": "D45",
    "d": "D46",
    "Dba": "D50",
    "mt": "D52",
    "gH": "D56",
    "gHs": "D56",
    "rd": "D56",
    "sbq": "D56",
    "b": "D58",
    "ab": "D59",
    "wab": "D60",
    "sAH": "D61",
    "rmi": "D9",

    "zAb": "E17",
    "mAi": "E22",
    "l": "E23",
    "rw": "E23",
    "Aby": "E24",
    "wn": "E34",
    "zzmt": "E6",

    "wsr": "F12",
    "wp": "F13",
    "db": "F16",
    "Hw": "F18",
    "bH": "F18",
    "ns": "F20",
    "DrD": "F21",
    "idn": "F21",
    "msDR": "F21",
    "sDm": "F21",
    "kfA": "F22",
    "pH": "F22",
    "xpS": "F23",
    "wHm": "F25",
    "Xn": "F26",
    "sti": "F29",
    "Sd": "F30",
    "ms": "F31",
    "X": "F32",
    "sd": "F33",
    "ib": "F34",
    "nfr": "F35",
    "zmA": "F36",
    "imAx": "F39",
    "HAt": "F4",
    "Aw": "F40",
    "spr": "F42",
    "isw": "F44",
    "iwa": "F44",
    "pXr": "F46",
    "qAb": "F46",
    "SsA": "F5",

    "A": "G1",
    "mwt": "G14",
    "nbty": "G16",
    "m": "G17",
    "mm": "G18",
    "AA": "G2",
    "nH": "G21",
    "Db": "G22",
    "rxyt": "G23",
    "Ax": "G25",
    "dSr": "G27",
    "gm": "G28",
    "bA": "G29",
    "baHi": "G32",
    "aq": "G35",
    "wr": "G36",
    "nDs": "G37",
    "gb": "G38",
    "zA": "G39",
    "tyw": "G4",
    "pA": "G40",
    "xn": "G41",
    "wSA": "G42",
    "w": "G43",
    "ww": "G44",
    "mAw": "G46",
    "TA": "G47",
    "snD": "G54",

    "pq": "H2",
    "wSm": "H2",
    "pAq": "H3",
    "nr": "H4",
    "Sw": "H6",

    "aSA": "I1",
    "D": "I10",
    "DD": "I11",
    "Styw": "I2",
    "mzH": "I3",
    "sbk": "I4",
    "sAq": "I5",
    "km": "I6",
    "Hfn": "I8",
    "f": "I9",

    "in": "K1",
    "ad": "K3",
    "XA": "K4",
    "bz": "K5",
    "nSmt": "K6",

    "xpr": "L1",
    "bit": "L2",
    "srqt": "L7",

    "iAm": "M1",
    "wdn": "M11",
    "xA": "M12",
    "1000": "M12",
    "wAD": "M13",
    "HA": "M16",
    "i": "M17",
    "ii": "M18",
    "Hn": "M2",
    "sxt": "M20",
    "sm": "M21",
    "nn": "M22A",
    "sw": "M23",
    "rsw": "M24",
    "Sma": "M26",
    "nDm": "M29",
    "xt": "M3",
    "bnr": "M30",
    "bdt": "M34",
    "Dr": "M36",
    "rnp": "M4",
    "iz": "M40",
    "tr": "M6",
    "SA": "M8",
    "zSn": "M9",

    "pt": "N1",
    "Abd": "N11",
    "iaH": "N11",
    "dwA": "N14",
    "sbA": "N14",
    "dwAt": "N15",
    "tA": "N16",
    "iw": "N18",
    "wDb": "N20",
    "spAt": "N24",
    "xAst": "N25",
    "Dw": "N26",
    "Axt": "N27",
    "xa": "N28",
    "q": "N29",
    "iAt": "N30",
    "n": "N35",
    "mw": "N35A",
    "S": "N37",
    "iAdt": "N4",
    "idt": "N4",
    "Sm": "N40",
    "id": "N41",
    "hrw": "N5",
    "ra": "N5",
    "zw": "N5",
    "Hnmmt": "N8",
    "pzD": "N9",

    "pr": "O1",
    "aH": "O11",
    "wsxt": "O15",
    "kAr": "O18",
    "zH": "O22",
    "txn": "O25",
    "iwn": "O28",
    "aA": "O29",
    "zxnt": "O30",
    "z": "O34",
    "zb": "O35",
    "inb": "O36",
    # "qnbt": "O38A",
    "h": "O4",
    "Szp": "O42",
    "ipt": "O45",
    "nxn": "O47",
    "niwt": "O49",
    "zp": "O50",
    "Snwt": "O51",
    "Hwt": "O6",

    "wHa": "P4",
    "TAw": "P5",
    "nfw": "P5",
    "aHa": "P6",
    "xrw": "P8",

    "st": "Q1",
    "wz": "Q2",
    "p": "Q3",
    "qrsw": "Q6",

    "xAt": "R1",
    "xAwt": "R1",
    "Dd": "R11",
    "dd": "R11",
    "imnt": "R14",
    "iAb": "R15",
    "wx": "R16",
    "xm": "R22",
    "Htp": "R4",
    "kAp": "R5",
    "kp": "R5",
    "snTr": "R7",
    "nTr": "R8",
    # "nTrw": "R8A",
    "bd": "R9",

    "HDt": "S1",
    "N": "S3",
    "dSrt": "S3",
    "sxmty": "S6",
    "xprS": "S7",
    "Atf": "S8",
    "Swty": "S9",
    "mDH": "S10",
    "wsx": "S11",
    "nbw": "S12",
    "THn": "S15",
    "tHn": "S15",
    "mnit": "S18",
    "sDAw": "S19",
    "xtm": "S20",
    "sT": "S22",
    "dmD": "S23",
    "Tz": "S24",
    "Sndyt": "S26",
    "mnxt": "S27",
    "s": "S29",
    "sf": "S30",
    "siA": "S32",
    "Tb": "S33",
    "anx": "S34",
    "Swt": "S35",
    "xw": "S37",
    "HqA": "S38",
    "awt": "S39",
    "wAs": "S40",
    "Dam": "S41",
    "abA": "S42",
    "sxm": "S42",
    "xrp": "S42",
    "md": "S43",
    "Ams": "S44",
    "nxxw": "S45",

    "pD": "T10",
    "sXr": "T11",
    "zin": "T11",
    "zwn": "T11",
    "Ai": "T12",
    "Ar": "T12",
    "rwD": "T12",
    "rwd": "T12",
    "rs": "T13",
    "qmA": "T14",
    "wrrt": "T17",
    "Sms": "T18",
    "qs": "T19",
    "wa": "T21",
    "sn": "T22",
    "iH": "T24",
    "DbA": "T25",
    "Xr": "T28",
    "nmt": "T29",
    "HD": "T3",
    "sSm": "T31",
    "nm": "T34",
    "HDD": "T6",
    "pd": "T9",

    "mA": "U1",
    "it": "U10",
    "HqAt": "U11",
    "Sna": "U13",
    "hb": "U13",
    "tm": "U15",
    "biA": "U16",
    "grg": "U17",
    "stp": "U21",
    "mnx": "U22",
    "Ab": "U23",
    "Hmt": "U24",
    "wbA": "U26",
    "DA": "U28",
    "rtH": "U31",
    "zmn": "U32",
    "ti": "U33",
    "xsf": "U34",
    "Hm": "U36",
    "mxAt": "U38",
    "mr": "U6",

    "100": "V1",
    "arq": "V12",
    "T": "V13",
    "iTi": "V15",
    "TmA": "V19",
    "XAr": "V19",
    "mDt": "V19",
    "sTA": "V2",
    "10": "V20",
    "mD": "V20",
    "mH": "V22",
    "wD": "V24",
    "aD": "V26",
    "H": "V28",
    "sk": "V29",
    "wAH": "V29",
    "sTAw": "V3",
    "nb": "V30",
    "k": "V31",
    "msn": "V32",
    "sSr": "V33",
    "idr": "V37",
    "wA": "V4",
    "snT": "V5",
    "sS": "V6",
    "Sn": "V7",

    "iab": "W10",
    "g": "W11",
    "nzt": "W11",
    "Hz": "W14",
    "xnt": "W17",
    "mi": "W19",
    "bAs": "W2",
    "Hnqt": "W22",
    "nw": "W24",
    "ini": "W25",
    "Hb": "W3",
    "Xnm": "W9",

    "t": "X1",
    "di": "X8",
    "rdi": "X8",

    "mDAt": "Y1",
    "mnhd": "Y3",
    "zS": "Y3",
    "mn": "Y5",
    "ibA": "Y6",
    "zSSt": "Y8",

    "imi": "Z11",
    "y": "Z4",
    "W": "Z7",

    "x": "AA1",
    "mAa": "AA11",
    "gs": "AA13",
    "im": "AA15",
    # Fixed: was "A15" (the sign already aliased by "xr"); this entry sits in
    # the AA group next to "im" and is meant to share its sign.
    "M": "AA15",
    "sA": "AA17",
    "apr": "AA20",
    "wDa": "AA21",
    "nD": "AA27",
    "qd": "AA28",
    "Xkr": "AA30",
    "Hp": "AA5",
    "qn": "AA8",
}
# Lookup table from markup tokens to the text emitted for them.  It starts
# with the spacing tokens and is then extended with one entry per hieroglyph
# character, keyed by its sign code (e.g. "A1", "M22A", "AA15").
hiero_map: dict[str, str] = {
    "H_SPACE": "\u00a0",
    ".": " ",
    "..": "\u2003",
}

# The last word of a hieroglyph's Unicode character name is its zero-padded
# sign code: a letter prefix, a number, and an optional letter suffix.
_SIGN_CODE_RE = re.compile(r"([a-zA-Z]+)(\d+[a-zA-Z]*)$")


def _sign_code_from_name(char_name: str) -> str:
    """Return the unpadded sign code encoded in a Unicode character name.

    The last whitespace-separated word of ``char_name`` is split into its
    letter prefix and numeric part, and leading zeros are removed from the
    numeric part (``"A001"`` -> ``"A1"``, ``"M022A"`` -> ``"M22A"``).

    Raises:
        RuntimeError: if the last word does not look like a sign code.
    """
    m = _SIGN_CODE_RE.match(char_name.split()[-1])
    if m is None:
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        raise RuntimeError(
            f"Unexpected hieroglyph character name: {char_name!r}"
        )
    prefix, suffix = m.groups()
    # Drop zero padding but never strip the suffix down to nothing.
    return prefix + (suffix.lstrip("0") or "0")


# Add unicode codes to map.  The whole U+13000..U+1342F range is scanned
# (the original loop stopped one code point short); code points that the
# running Python's Unicode database does not name are skipped, so this works
# whether or not U+1342F is assigned there.
for _codepoint in range(0x13000, 0x13430):
    _ch = chr(_codepoint)
    _char_name = unicodedata.name(_ch, "")
    if not _char_name:
        continue
    hiero_map[_sign_code_from_name(_char_name)] = _ch
# Map phonemes to unicode characters: every phoneme alias becomes an extra
# hiero_map key that yields the same character as the sign code it names.
# A phoneme whose sign code is missing from hiero_map is a table error, so
# fail loudly at import time.  The exception carries the full message, so
# nothing is printed to stdout beforehand.
for name, g in hiero_phoneme_map.items():
    if g not in hiero_map:
        raise RuntimeError(f"Phoneme {name} maps to unrecognized {g}")
    hiero_map[name] = hiero_map[g]
def convert_asterisk(text: str) -> str:
    """Convert a ``*``-separated run of tokens into joined characters.

    ``text`` is split on ``*``; each piece is looked up in ``hiero_map`` and
    the results are joined with U+13431 (EGYPTIAN HIEROGLYPH HORIZONTAL
    JOINER).  A piece that is not in ``hiero_map`` is reported on stdout and
    passed through unchanged, so unknown markup stays visible in the output.

    Args:
        text: One group of tokens, optionally separated by ``*``.

    Returns:
        The converted string; for a single known token this is just its
        mapped character.
    """
    # The unconditional debug trace of the token list was removed: it wrote
    # to stdout on every call.  The anomaly-only diagnostic below is kept.
    result = []
    for token in text.split("*"):
        if token in hiero_map:
            result.append(hiero_map[token])
        else:
            print("Unhandled token: {!r}".format(token))
            result.append(token)
    return "\U00013431".join(result)
def convert_colon(text: str) -> str:
    """Convert a ``:``-separated stack of groups into joined characters.

    ``text`` is split on ``:`` and each part is converted with
    ``convert_asterisk``.  When there is more than one part, any part that
    converted to more than one character is wrapped in U+13437 / U+13438
    (EGYPTIAN HIEROGLYPH BEGIN SEGMENT / END SEGMENT) so it is treated as a
    unit.  The parts are then joined with U+13430 (EGYPTIAN HIEROGLYPH
    VERTICAL JOINER).

    Args:
        text: One group of tokens using ``:`` and/or ``*`` separators.

    Returns:
        The converted string.
    """
    # The unconditional debug trace of the token list was removed: it wrote
    # to stdout on every call.
    tokens = text.split(":")
    is_stack = len(tokens) > 1  # loop-invariant, so computed once
    result = []
    for token in tokens:
        v = convert_asterisk(token)
        if is_stack and len(v) > 1:
            v = "\U00013437" + v + "\U00013438"
        result.append(v)
    return "\U00013430".join(result)
def convert_hiero(text: str) -> str:
    """Convert hieroglyph markup text into a string of Unicode characters.

    The text is tokenized into whitespace, ``-`` separators, ``!`` line
    breaks, ``..`` blanks, runs of ``[a-zA-Z0-9*:_]`` (one group each), and
    any other single character.  Whitespace and ``-`` produce no output,
    ``!`` produces a newline, and every other token is converted with
    ``convert_colon``.

    Args:
        text: The markup to convert.

    Returns:
        The concatenated conversion of all tokens.
    """
    # ``\.\.`` must come before the catch-all ``.``: otherwise ".." is always
    # split into two "." tokens and the ".." entry of hiero_map can never be
    # reached from here.
    lst = re.findall(
        r"\s+|\s*-\s*|\s*!\s*|\.\.|[a-zA-Z0-9*:_]+|.", text
    )
    result = []
    for x in lst:
        x = x.strip()
        if not x or x == "-":
            continue
        if x == "!":
            result.append("\n")
            continue
        v = convert_colon(x)
        # Wrap a multi-character group in U+13437 / U+13438 (BEGIN SEGMENT /
        # END SEGMENT) when it is not the only token.
        # NOTE(review): len(lst) also counts whitespace and "-" tokens, so
        # surrounding whitespace alone is enough to trigger the wrapping --
        # confirm this is intended.
        if len(v) > 1 and len(lst) > 1:
            v = "\U00013437" + v + "\U00013438"
        result.append(v)
    return "".join(result)