Coverage for src / wiktextract / extractor / en / translations.py: 88%
238 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-17 07:22 +0000
1# Code related to parsing translations
2#
3# Copyright (c) 2019-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import re
7from typing import Optional
9from mediawiki_langcodes import code_to_name, name_to_code
10from wikitextprocessor import MAGIC_FIRST, MAGIC_LAST
12from ...datautils import data_append, data_extend, split_at_comma_semi
13from ...wxr_context import WiktextractContext
14from .form_descriptions import (
15 classify_desc,
16 decode_tags,
17 nested_translations_re,
18 parse_translation_desc,
19 tr_note_re,
20)
21from .type_utils import TranslationData, WordData
# Corrections for names that appear as language headers in translation
# sections but are not the canonical language names.  E.g., "Apache" is
# not a language name, but "Apachean" is.
tr_langname_map = {
    "Apache": "Apachean",  # language family name, not a single language
    "Lutshootseed": "Lushootseed",  # common misspelling on Wiktionary
    "Old Assamese": "Early Assamese",
}
# These names will be interpreted as script names or dialect names
# when used as a second-level name in translations. Some script names
# are also valid language names, but it looks like the ones that are
# also script names aren't used on the second level as language names.
# These will not be interpreted as a separate language, but will instead
# be included under the parent language with the script/dialect as a tag
# (with spaces replaced by hyphens).
script_and_dialect_names = {
    # Scripts
    "ALUPEC",
    "Adlam",
    "Arabic",  # Script for Kashmiri
    "Bengali",
    "Burmese",
    "Carakan",
    "CJKV Characters",
    "Cyrillic",
    "Devanagari",
    "Glagolitic",
    "Gurmukhi",
    "Hebrew",  # For Aramaic
    "Jawi",
    "Khmer",
    "Latin",
    "Mongolian",
    "Roman",
    "Shahmukhi",
    "Sinhalese",
    "Syriac",  # For Aramaic
    "Classical Syriac",  # For Aramaic
    "Taraškievica",
    "Thai",
    "Uyghurjin",
    # Chinese dialects/languages
    "Cantonese",  # Variant of Chinese
    "Dungan",  # Chinese
    "Gan",  # Chinese
    "Hakka",  # Chinese
    "Hokkien",  # Chinese
    "Jin",  # Chinese
    "Mandarin",  # Chinese
    "Min Bei",  # Chinese
    "Min Dong",  # Chinese
    "Min Nan",  # Chinese
    "Wu",  # Chinese
    "Xiang",  # Chinese
    "Jianghuai Mandarin",  # Chinese
    "Jilu Mandarin",  # Chinese
    "Jin Mandarin",  # Chinese
    "Northern Mandarin",  # Chinese
    "Southwestern Mandarin",  # Chinese
    "Taiwanese Mandarin",  # Chinese
    "Coastal Min",  # Chinese
    "Inland Min",  # Chinese
    "Leizhou Min",  # Chinese
    "Min",  # Chinese
    "Puxian Min",  # Chinese
    "Shanghainese Wu",  # Chinese
    "Wenzhou Wu",  # Chinese
    "Wenzhou",  # Chinese
    "Hsinchu Hokkien",  # Chinese
    "Jinjiang Hokkien",  # Chinese
    "Kaohsiung Hokkien",  # Chinese
    "Pinghua",  # Chinese
    "Eastern Punjabi",
    "Western Punjabi",
    # Various countries/regions
    "Alsace",
    "Bavaria",
    "Belgium",
    "Canada",
    "Central",
    "Cologne",
    "Fogo",
    "Föhr",
    "Föhr-Amrum",
    "Hallig",
    "Helgoland",
    "Heligoland",
    "Santiago",
    "Sylt",
    "Mooring",
    "Vancouver Island",
    "Wiedingharde",
    "Anpezan",  # Variant of Ladin
    "Badiot",  # Ladin
    "Fascian",  # Ladin
    "Fodom",  # Ladin
    "Gherdëina",  # Ladin
    "Anbarani",  # Variant of Talysh
    "Asalemi",  # Variant of Talysh
    "Alemannic German",  # Variant of German
    "Rhine Franconian",  # Variant of German
    "German Low German",  # Variant of Low German
    "Campidanese",  # Variant of Sardinian
    "Logudorese",  # Variant of Sardinian
    "Digor",  # Variant of Ossetian
    "Iron",  # Variant of Ossetian
    "Northern Puebla",  # Variant of Nahuatl
    "Mecayapan",  # Variant of Nahuatl
    "Egyptian Arabic",  # Variant of Arabic
    "Gulf Arabic",  # Variant of Arabic
    "Hijazi Arabic",  # Variant of Arabic
    "Moroccan Arabic",  # Variant of Arabic
    "North Levantine Arabic",  # Variant of Arabic
    "South Levantine Arabic",  # Variant of Arabic
    "Alviri",  # Variant of Alviri-Vidari
    "Vidari",  # Variant of Alviri-Vidari
    "Tashelhit",  # Variant of Berber
    "Bokmål",  # Variant of Norwegian
    "Nynorsk",  # Variant of Norwegian
    "Mycenaean",  # Variant of Greek
    # Language varieties
    "Ancient",
    "Classical",
    "Draweno-Polabian",
    "Literary",
    "Lower",
    "Manitoba Saulteux",
    "Modern",
    "Modern Polabian",
    "Modified traditional",
    "Northern",
    "Northern and Southern",
    "Old Polabian",
    "Simplified",
    "Southern",
    "Traditional",
    "Western",
    "1708",
    "1918",
}
# Second-level translation names that are interpreted directly as tags.
# Each value is a space-separated list of the tags to assign.
tr_second_tagmap = {
    "Föhr-Amrum, Bökingharde": "Föhr-Amrum Bökingharde",
    "Halligen, Goesharde, Karrhard": "Halligen Goesharde Karrhard",
    "Föhr-Amrum and Sylt dialect": "Föhr-Amrum Sylt",
    "Hallig and Mooring": "Hallig Mooring",
    "Föhr-Amrum & Mooring": "Föhr-Amrum Mooring",
}
# Translations starting with any of these prefixes are ignored (or moved
# to the "note" field); they are explanatory text, not translations.
tr_ignore_prefixes = [
    # Markers and placeholders
    "+",
    "[Book Pahlavi needed]",
    "[book pahlavi needed]",
    "[script needed]",
    # Explanatory phrases (capitalized and lowercase forms)
    "Different structure used",
    "Literally",
    "No equivalent",
    "Not used",
    "Please add this translation if you can",
    "See: ",
    "Use ",
    "different structure used",
    "e.g.",
    "lit.",
    "literally",
    "no equivalent",
    "normally ",
    "not used",
    "noun compound ",
    "please add this translation if you can",
    "prefix ",
    "see: ",
    "suffix ",
    "use ",
    "usually ",
]
# Translations containing any of these substrings anywhere (case-sensitive)
# are treated as explanatory text: they go into the "note" field rather
# than "word".
tr_ignore_contains = [
    "usually expressed with ",
    " can be used ",
    " construction used",
    " used with ",
    " + ",
    "genitive case",
    "dative case",
    "nominative case",
    "accusative case",
    "absolute state",
    "infinitive of ",
    "participle of ",
    "for this sense",
    "depending on the circumstances",
    "expressed with ",
    " expression ",
    " means ",
    " is used",
    " — ",  # Used to give example sentences
    " translation",
    "not attested",
    "grammatical structure",
    "construction is used",
    "tense used",
    " lit.",
    " literally",
    # Grammatical case / form names
    "dative",
    "accusative",
    "genitive",
    "essive",
    "partitive",
    "translative",
    "elative",
    "inessive",
    "illative",
    "adessive",
    "ablative",
    "allative",
    "abessive",
    "comitative",
    "instructive",
    "particle",
    "predicative",
    "attributive",
    "preposition",
    "postposition",
    "prepositional",
    "postpositional",
    "prefix",
    "suffix",
    "translated",
]
# Translations matching any of these regular expressions are ignored:
# bare sense-number references, trailing "??", or whitespace-only items.
tr_ignore_regexps = [
    r"^\[[\d,]+\]$",
    r"\?\?$",
    r"^\s*$",
]
# If a translation matches this regexp (with re.search), we print a debug
# message.  The pattern flags leftover gender/number markers at the end of
# the word and various substrings that suggest markup or explanatory text.
tr_suspicious_re = re.compile(
    r" [mf][12345]$|"
    r" [mfnc]$|"
    r" (pf|impf|vir|nvir|anml|anim|inan|sg|pl)$|"
    + "|".join(
        map(
            re.escape,
            [
                "; ",
                "* ",
                ": ",
                "[",
                "]",
                "{",
                "}",
                "/",
                "^",
                "literally",
                "lit.",
                # XXX check occurrences of ⫽, seems to be used as verb-object
                # separator but shouldn't really be part of the canonical form.
                # See e.g. 打工/Chinese
                "⫽",
                "also expressed with",
                "e.g.",
                "cf.",
                "used ",
                "script needed",
                "please add this translation",
                "usage ",
            ],
        )
    )
)
# Regular expression searched (with re.search) in a translation candidate
# to decide whether it should be ignored / demoted to a note.  It combines:
# an anchored alternation of the ignore prefixes, the unanchored "contains"
# substrings, and the raw ignore regexps (which are not escaped).
_ignore_prefix_alt = "|".join(re.escape(p) for p in tr_ignore_prefixes)
_ignore_contains_alt = "|".join(re.escape(s) for s in tr_ignore_contains)
_ignore_regexp_alt = "|".join(tr_ignore_regexps)  # already regexps; not escaped
tr_ignore_re = re.compile(
    "^("
    + _ignore_prefix_alt
    + ")|"
    + _ignore_contains_alt
    + "|"
    + _ignore_regexp_alt
)
# English gloss texts that are really grammatical descriptions; when seen
# in a translation's "english" field they are converted to these tags.
english_to_tags = {
    "I have": "first-person singular",
    "you have": "second-person singular",
    "she has": "third-person singular feminine",
    "he has": "third-person singular masculine",
}
def parse_translation_item_text(
    wxr: WiktextractContext,
    word: str,
    data: WordData,
    item: str,
    sense: Optional[str],
    lang: Optional[str],
    langcode: Optional[str],
    translations_from_template: list[str],
    is_reconstruction: bool,
) -> Optional[str]:
    """Parse one translation list item and append results to ``data``.

    ``item`` is the expanded text of a single translation list entry.
    ``sense`` is the sense gloss for the translation table (if any);
    ``lang`` and ``langcode`` are the language name/code inherited from a
    parent item or from the translation template.
    ``translations_from_template`` lists translation words captured from
    templates in the item; they are protected from being split at commas.
    Returns the language name recognized for this item (so nested
    sub-items can inherit it), or None if the item was rejected.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(data, dict)
    assert isinstance(item, str)
    assert sense is None or isinstance(sense, str)
    assert lang is None or isinstance(lang, str)  # Parent item language
    assert langcode is None or isinstance(langcode, str)  # Template langcode
    assert isinstance(translations_from_template, list)
    for x in translations_from_template:
        assert isinstance(x, str)
    assert is_reconstruction in (True, False)

    # print("parse_translation_item_text: {!r} lang={}"
    #       " langcode={}".format(item, lang, langcode))

    if not item:
        return None

    # Find and remove nested translations from the item
    nested = [m.group(1) for m in re.finditer(nested_translations_re, item)]
    if nested:
        item = re.sub(nested_translations_re, "", item)

    if re.search(r"\(\d+\)|\[\d+\]", item) and "numeral:" not in item:
        wxr.wtp.debug(
            "possible sense number in translation item: {}".format(item),
            sortid="translations/324",
        )

    # Translation items should start with a language name (except
    # some nested translation items don't and rely on the language
    # name from the higher level, and some append a language variant
    # name to a broader language name)
    extra_langcodes = set()
    if lang and name_to_code(lang, "en") != "":
        lang_code = name_to_code(lang, "en")
        extra_langcodes.add(lang_code)
        # Canonicalize language name (we could have gotten it via
        # alias or other_names)
        if new_lang_name := code_to_name(lang_code, "en"):
            lang = new_lang_name
    # Accept both ASCII and fullwidth colon after the language name.
    m = re.match(r"\*?\s*([-' \w][-'&, \w()]*)[:：]\s*", item)
    tags = []
    if m:
        lang_sublang = ""
        sublang = m.group(1).strip()
        language_name_variations: list[str] = list()
        if lang and sublang:
            # Try combined names in both orders, with spaces and hyphens,
            # e.g. "Low German" + "Saxon" -> "Low German Saxon" etc.
            lang_sublang = lang + " " + sublang
            sublang_lang = sublang + " " + lang
            language_name_variations.extend(
                (
                    lang_sublang,
                    sublang_lang,
                    lang_sublang.replace(" ", "-"),
                    sublang_lang.replace(" ", "-"),
                )
            )
            if " " in sublang:
                language_name_variations.append(sublang.replace(" ", "-"))
            if "-" in sublang:
                language_name_variations.append(sublang.replace("-", " "))

        if lang is None:
            if sublang == "Note":
                return None
            lang = sublang
        elif lang_sublang and any(
            name_to_code(captured_lang := lang_comb, "en") != ""
            # Catch the value of lang_comb with := (walrus) so the
            # matching combination is available after any() succeeds.
            for lang_comb in language_name_variations
        ):
            lang = captured_lang
        elif sublang in script_and_dialect_names:
            # If the second-level name is a script name, add it as
            # tag and keep the top-level language.
            # This helps with languages that script names
            # on the same level; those scripts may also be valid
            # language names. See leaf/English/Translations/Pali.
            tags.append(sublang.replace(" ", "-"))
        elif sublang in tr_second_tagmap:
            # Certain second-level names are interpreted as tags
            # (mapped to tags). Note that these may still have
            # separate language codes, so additional langcode
            # removal tricks may need to be played below.
            tags.extend(tr_second_tagmap[sublang].split())
        elif name_to_code(sublang, "en") != "":
            lang = sublang
        elif sublang[0].isupper() and classify_desc(sublang) == "tags":
            # Interpret it as a tag
            tags.append(sublang)
        else:
            # We don't recognize this prefix
            wxr.wtp.error(
                "unrecognized prefix (language name?) in "
                "translation item: {}".format(item),
                sortid="translations/369",
            )
            return None
        # Strip the language name/tag from the item
        item = item[m.end() :]
    elif lang is None:
        # No matching language prefix. Try if it is missing colon.
        parts = item.split()
        if len(parts) > 1 and name_to_code(parts[0], "en") != "":
            lang = parts[0]
            item = " ".join(parts[1:])
        else:
            if "__IGNORE__" not in item:
                wxr.wtp.error(
                    "no language name in translation item: {}".format(item),
                    sortid="translations/382",
                )
            return None

    # Map non-standard language names (e.g., "Apache" -> "Apachean")
    lang = tr_langname_map.get(lang, lang)

    # If we didn't get language code from the template, look it up
    # based on language name
    if langcode is None and name_to_code(lang, "en") != "":
        langcode = name_to_code(lang, "en")

    # Remove (<langcode>) parts from the item. They seem to be
    # generated by {{t+|...}}.
    if langcode:
        extra_langcodes.add(langcode)
        if "-" in langcode:
            extra_langcodes.add(langcode.split("-")[0])
        if langcode in (
            "zh",
            "yue",
            "cdo",
            "cmn",
            "dng",
            "hak",
            "mnp",
            "nan",
            "wuu",
            "zh-min-nan",
        ):
            # Chinese lects cross-reference each other's codes
            extra_langcodes.update(
                [
                    "zh",
                    "yue",
                    "cdo",
                    "cmn",
                    "dng",
                    "hak",
                    "mnp",
                    "nan",
                    "wuu",
                    "zh-min-nan",
                ]
            )
        elif langcode in ("nn", "nb", "no"):
            extra_langcodes.update(["no", "nn", "nb"])
    for x in extra_langcodes:
        item = re.sub(r"\s*\^?\({}\)".format(re.escape(x)), "", item)

    # Map translations obtained from templates into magic characters
    # before splitting the translations list. This way, if a comma
    # (or semicolon etc) was used inside the template, it won't get
    # split. We restore the magic characters into the original
    # translations after splitting. This kludge improves robustness
    # of collection translations for phrases whose translations
    # may contain commas.  Longest first so shorter translations do
    # not clobber parts of longer ones.
    translations_from_template = sorted(
        translations_from_template, key=len, reverse=True
    )
    tr_mappings = {}
    for i, trt in enumerate(translations_from_template):
        if not trt:
            continue
        ch = chr(MAGIC_FIRST + i)
        rex = re.escape(trt)
        # Require word boundaries when the translation starts/ends with a
        # word character, to avoid replacing inside longer words.
        if trt[0].isalnum():
            rex = r"\b" + rex
        if trt[-1].isalnum():
            rex = rex + r"\b"
        item = re.sub(rex, ch, item)
        tr_mappings[ch] = trt

    # There may be multiple translations, separated by comma
    nested.append(item)
    for item in nested:
        tagsets: list[tuple[str, ...]] = []
        # This never does anything; it's never updated, so it's always empty
        # topics: list[str] = []

        for part in split_at_comma_semi(
            item, extra=[" / ", " ／ ", "／", r"\| furthermore: "]
        ):
            # Substitute the magic characters back to original
            # translations (this is part of dealing with
            # phrasal translations containing commas).
            part = re.sub(
                r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST),
                lambda m: tr_mappings.get(m.group(0), m.group(0)),
                part,
            )

            if part.endswith(":"):  # E.g. "salt of the earth"/Korean
                part = part[:-1].strip()
            if not part:
                continue

            # Strip language links
            tr: TranslationData = {"lang": lang}
            if langcode:
                tr["code"] = langcode  # DEPRECATED in favor of "lang_code"
                tr["lang_code"] = langcode
            if tags:
                tr["tags"] = list(tags)
            # tagsets is always empty (see above); data_extend creates the
            # "tags" list if missing, avoiding a KeyError if this ever runs.
            for ttup in tagsets:
                data_extend(tr, "tags", ttup)
            # topics is never populated, so it's always empty
            # if topics:
            #     tr["topics"] = list(topics)
            if sense:
                if sense.startswith(
                    (
                        "Translations to be checked",
                        ":The translations below need to be checked",
                    )
                ):
                    continue  # Skip such translations
                else:
                    tr["sense"] = sense

            # Check if this part starts with (tags)
            m = re.match(r"\(([^)]+)\) ", part)
            if m:
                par = m.group(1)
                rest = part[m.end() :]
                cls = classify_desc(par, no_unknown_starts=True)
                if cls == "tags":
                    tagsets2, topics2 = decode_tags(par)
                    for ttup in tagsets2:
                        data_extend(tr, "tags", ttup)
                    data_extend(tr, "topics", topics2)
                    part = rest

            # Check if this part ends with (tags). Note that
            # note-re will mess things up if we rely on this being
            # checked later.
            m = re.search(r" +\(([^)]+)\)$", part)
            if m:
                par = m.group(1)
                rest = part[: m.start()]
                cls = classify_desc(par, no_unknown_starts=True)
                if cls == "tags":
                    tagsets2, topics2 = decode_tags(par)
                    for ttup in tagsets2:
                        data_extend(tr, "tags", ttup)
                    data_extend(tr, "topics", topics2)
                    part = rest

            # Check if this part starts with "<tags/english>: <rest>"
            m = re.match(r"([-\w() ]+): ", part)
            if m:
                par = m.group(1).strip()
                rest = part[m.end() :]
                if par in ("", "see"):
                    # BUG FIX: was `part = "rest"` (the literal string),
                    # which clobbered the translation; the intent, as in
                    # all sibling branches, is to drop the prefix.
                    part = rest
                else:
                    cls = classify_desc(par)
                    # print("par={!r} cls={!r}".format(par, cls))
                    if cls == "tags":
                        tagsets2, topics2 = decode_tags(par)
                        for ttup in tagsets2:
                            data_extend(tr, "tags", ttup)
                        data_extend(tr, "topics", topics2)
                        part = rest
                    elif cls == "english":
                        if re.search(tr_note_re, par):
                            if "note" in tr:
                                tr["note"] += "; " + par
                            else:
                                tr["note"] = par
                        else:
                            if "translation" in tr and "english" in tr:
                                # DEPRECATED for "translation"
                                tr["english"] += "; " + par
                                tr["translation"] += "; " + par
                            else:
                                # DEPRECATED for "translation"
                                tr["english"] = par
                                tr["translation"] = par
                        part = rest

            # Skip translations that our template_fn says to ignore
            # and those that contain Lua execution errors.
            if "__IGNORE__" in part:
                continue  # Contains something we want to ignore
            if part.startswith("Lua execution error"):
                continue

            # Handle certain suffixes in translations that
            # we might put in "note" but that we can actually
            # parse into tags.
            for suffix, t in (
                (" with dative", "with-dative"),
                (" with genitive", "with-genitive"),
                (" with accusative", "with-accusative"),
                (" in subjunctive", "with-subjunctive"),
                (" and conditional mood", "with-conditional"),
                (" - I have - you have", "first-person second-person singular"),
                (" - I have", "first-person singular"),
                (" - you have", "second-person singular"),
            ):
                if part.endswith(suffix):
                    part = part[: -len(suffix)]
                    data_append(tr, "tags", t)
                    break

            # Handle certain prefixes in translations
            for prefix, t in (("subjunctive of ", "with-subjunctive"),):
                if part.startswith(prefix):
                    part = part[len(prefix) :]
                    data_append(tr, "tags", t)
                    break

            # Skip certain one-character translations entirely
            # (these could result from templates being ignored)
            if part in ",;.":
                continue

            if "english" in tr and tr["english"] in english_to_tags:
                data_extend(tr, "tags", english_to_tags[tr["english"]].split())
                del tr["english"]  # DEPRECATED for "translation"
                if "translation" in tr:
                    del tr["translation"]

            # Certain values indicate it is not actually a translation.
            # See definition of tr_ignore_re to adjust.
            m = re.search(tr_ignore_re, part)
            w: Optional[str] = None

            if m and (
                m.start() != 0 or m.end() != len(part) or len(part.split()) > 1
            ):
                # This translation will be skipped because it
                # seems to be some kind of explanatory text.
                # However, let's put it in the "note" field
                # instead, unless it is one of the listed fully
                # ignored ones.
                if part in ("please add this translation if you can",):
                    continue
                # Save in note field
                tr["note"] = part
            else:
                # Interpret it as an actual translation
                parse_translation_desc(wxr, lang, part, tr)
                w = tr.get("word")
                if not w:
                    continue  # Not set or empty
                if w.startswith(("*", ":")):
                    w = w[1:].strip()
                if w in ("[Term?]", ":", "/", "?"):
                    continue  # These are not valid linkage targets
                if len(w) > 3 * len(word) + 20:
                    # Accept translation if word looks like acronym:
                    # 'ISBN', 'I.S.B.N'.isupper() return True, and
                    # false positives are unlikely.
                    if not word.isupper():
                        # Likely descriptive text or example because
                        # it is much too long.
                        wxr.wtp.debug(
                            "Translation too long compared to word, so"
                            " it is skipped",
                            sortid="translations/609-20230504",
                        )
                        del tr["word"]
                        tr["note"] = w

            # Sanity check: try to detect certain suspicious
            # patterns in translations
            if "word" in tr:
                m = re.search(tr_suspicious_re, tr["word"])
                if m and lang not in (
                    "Bats",  # ^ in tree/English/Tr/Bats
                ):
                    wxr.wtp.debug(
                        "suspicious translation with {!r}: {}".format(
                            m.group(0), tr
                        ),
                        sortid="translations/611",
                    )

            if "tags" in tr:
                tr["tags"] = sorted(set(tr["tags"]))

            # If we have only notes, add as-is
            if "word" not in tr:
                data_append(data, "translations", tr)
                continue

            # Split if it contains no spaces
            if w:
                alts = [w]
                if " " not in w:
                    # If no spaces, split by separator
                    alts = re.split(r"/|／", w)
                # Note: there could be remaining slashes, but they are
                # sometimes used in ways we cannot resolve programmatically.
                # Create translations for each alternative.
                for alt in alts:
                    alt = alt.strip()
                    tr1 = copy.deepcopy(tr)
                    if alt.startswith("*") or alt.startswith(":"):
                        alt = alt[1:].strip()
                    if not alt:
                        continue
                    tr1["word"] = alt
                    data_append(data, "translations", tr1)

    # Return the language name, in case we have subitems
    return lang