# Code related to parsing translations
#
# Copyright (c) 2019-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org

import copy
import re
from typing import Optional

from mediawiki_langcodes import code_to_name, name_to_code
from wikitextprocessor import MAGIC_FIRST, MAGIC_LAST

from ...datautils import data_append, data_extend, split_at_comma_semi
from ...wxr_context import WiktextractContext
from .form_descriptions import (
    classify_desc,
    decode_tags,
    nested_translations_re,
    parse_translation_desc,
    tr_note_re,
)
from .type_utils import TranslationData, WordData

# Maps language names in translations to actual language names.
# E.g., "Apache" is not a language name, but "Apachean" is.
tr_langname_map = {
    "Apache": "Apachean",
    "Lutshootseed": "Lushootseed",
    "Old Assamese": "Early Assamese",
}
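
# Illustration: the map is applied with dict.get() below, so unmapped
# names pass through unchanged:
#     tr_langname_map.get("Apache", "Apache")    -> "Apachean"
#     tr_langname_map.get("Finnish", "Finnish")  -> "Finnish"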

# These names will be interpreted as script names or dialect names
# when used as a second-level name in translations.  Some script names
# are also valid language names, but it looks like the ones that are
# also script names aren't used on the second level as language names.
# These will not be interpreted as a separate language, but will instead
# be included under the parent language with the script/dialect as a tag
# (with spaces replaced by hyphens).
script_and_dialect_names = set(
    [
        # Scripts
        "ALUPEC",
        "Adlam",
        "Arabic",  # Script for Kashmiri
        "Bengali",
        "Burmese",
        "Carakan",
        "CJKV Characters",
        "Cyrillic",
        "Devanagari",
        "Glagolitic",
        "Gurmukhi",
        "Hebrew",  # For Aramaic
        "Jawi",
        "Khmer",
        "Latin",
        "Mongolian",
        "Roman",
        "Shahmukhi",
        "Sinhalese",
        "Syriac",  # For Aramaic
        "Classical Syriac",  # For Aramaic
        "Taraškievica",
        "Thai",
        "Uyghurjin",
        # Chinese dialects/languages
        "Cantonese",  # Variant of Chinese
        "Dungan",  # Chinese
        "Gan",  # Chinese
        "Hakka",  # Chinese
        "Hokkien",  # Chinese
        "Jin",  # Chinese
        "Mandarin",  # Chinese
        "Min Bei",  # Chinese
        "Min Dong",  # Chinese
        "Min Nan",  # Chinese
        "Wu",  # Chinese
        "Xiang",  # Chinese
        "Jianghuai Mandarin",  # Chinese
        "Jilu Mandarin",  # Chinese
        "Jin Mandarin",  # Chinese
        "Northern Mandarin",  # Chinese
        "Southwestern Mandarin",  # Chinese
        "Taiwanese Mandarin",  # Chinese
        "Coastal Min",  # Chinese
        "Inland Min",  # Chinese
        "Leizhou Min",  # Chinese
        "Min",  # Chinese
        "Puxian Min",  # Chinese
        "Shanghainese Wu",  # Chinese
        "Wenzhou Wu",  # Chinese
        "Wenzhou",  # Chinese
        "Hsinchu Hokkien",  # Chinese
        "Jinjiang Hokkien",  # Chinese
        "Kaohsiung Hokkien",  # Chinese
        "Pinghua",  # Chinese
        "Eastern Punjabi",
        "Western Punjabi",
        # Various countries/regions
        "Alsace",
        "Bavaria",
        "Belgium",
        "Canada",
        "Central",
        "Cologne",
        "Fogo",
        "Föhr",
        "Föhr-Amrum",
        "Hallig",
        "Helgoland",
        "Heligoland",
        "Santiago",
        "Sylt",
        "Mooring",
        "Vancouver Island",
        "Wiedingharde",
        "Anpezan",  # Variant of Ladin
        "Badiot",  # Ladin
        "Fascian",  # Ladin
        "Fodom",  # Ladin
        "Gherdëina",  # Ladin
        "Anbarani",  # Variant of Talysh
        "Asalemi",  # Variant of Talysh
        "Alemannic German",  # Variant of German
        "Rhine Franconian",  # Variant of German
        "German Low German",  # Variant of Low German
        "Campidanese",  # Variant of Sardinian
        "Logudorese",  # Variant of Sardinian
        "Digor",  # Variant of Ossetian
        "Iron",  # Variant of Ossetian
        "Northern Puebla",  # Variant of Nahuatl
        "Mecayapan",  # Variant of Nahuatl
        "Egyptian Arabic",  # Variant of Arabic
        "Gulf Arabic",  # Variant of Arabic
        "Hijazi Arabic",  # Variant of Arabic
        "Moroccan Arabic",  # Variant of Arabic
        "North Levantine Arabic",  # Variant of Arabic
        "South Levantine Arabic",  # Variant of Arabic
        "Alviri",  # Variant of Alviri-Vidari
        "Vidari",  # Variant of Alviri-Vidari
        "Tashelhit",  # Variant of Berber
        "Bokmål",  # Variant of Norwegian
        "Nynorsk",  # Variant of Norwegian
        "Mycenaean",  # Variant of Greek
        # Language varieties
        "Ancient",
        "Classical",
        "Draweno-Polabian",
        "Literary",
        "Lower",
        "Manitoba Saulteux",
        "Modern",
        "Modern Polabian",
        "Modified traditional",
        "Northern",
        "Northern and Southern",
        "Old Polabian",
        "Simplified",
        "Southern",
        "Traditional",
        "Western",
        "1708",
        "1918",
    ]
)
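
# Illustration: under a parent language such as Kashmiri, a second-level
# name like "Arabic" from this set keeps lang == "Kashmiri" and adds the
# tag "Arabic"; multi-word names such as "Min Nan" become the tag
# "Min-Nan" (spaces replaced by hyphens).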

# These second-level names in translations are interpreted as tags; the
# value lists the tags, space-separated.
tr_second_tagmap = {
    "Föhr-Amrum, Bökingharde": "Föhr-Amrum Bökingharde",
    "Halligen, Goesharde, Karrhard": "Halligen Goesharde Karrhard",
    "Föhr-Amrum and Sylt dialect": "Föhr-Amrum Sylt",
    "Hallig and Mooring": "Hallig Mooring",
    "Föhr-Amrum & Mooring": "Föhr-Amrum Mooring",
}
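
# Illustration: a second-level name "Hallig and Mooring" is not treated
# as a language; its mapped value is split on spaces, yielding the tags
# ["Hallig", "Mooring"] under the parent language.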

# Ignore translations that start with one of these
tr_ignore_prefixes = [
    "+",
    "Different structure used",
    "Literally",
    "No equivalent",
    "Not used",
    "Please add this translation if you can",
    "See: ",
    "Use ",
    "[Book Pahlavi needed]",
    "[book pahlavi needed]",
    "[script needed]",
    "different structure used",
    "e.g.",
    "lit.",
    "literally",
    "no equivalent",
    "normally ",
    "not used",
    "noun compound ",
    "please add this translation if you can",
    "prefix ",
    "see: ",
    "suffix ",
    "use ",
    "usually ",
]

# Translations that contain one of these anywhere (case-sensitive) are
# not taken as translations; instead, they are put in the "note" field
# rather than in "word".
tr_ignore_contains = [
    "usually expressed with ",
    " can be used ",
    " construction used",
    " used with ",
    " + ",
    "genitive case",
    "dative case",
    "nominative case",
    "accusative case",
    "absolute state",
    "infinitive of ",
    "participle of ",
    "for this sense",
    "depending on the circumstances",
    "expressed with ",
    " expression ",
    " means ",
    " is used",
    " — ",  # Used to give example sentences
    " translation",
    "not attested",
    "grammatical structure",
    "construction is used",
    "tense used",
    " lit.",
    " literally",
    "dative",
    "accusative",
    "genitive",
    "essive",
    "partitive",
    "translative",
    "elative",
    "inessive",
    "illative",
    "adessive",
    "ablative",
    "allative",
    "abessive",
    "comitative",
    "instructive",
    "particle",
    "predicative",
    "attributive",
    "preposition",
    "postposition",
    "prepositional",
    "postpositional",
    "prefix",
    "suffix",
    "translated",
]

# Ignore translations that match one of these regular expressions
tr_ignore_regexps = [
    r"^\[[\d,]+\]$",
    r"\?\?$",
    r"^\s*$",
]

# If a translation matches this regexp (with re.search), we print a
# debug message
tr_suspicious_re = re.compile(
    r" [mf][12345]$|"
    + r" [mfnc]$|"
    + r" (pf|impf|vir|nvir|anml|anim|inan|sg|pl)$|"
    + "|".join(
        re.escape(x)
        for x in [
            "; ",
            "* ",
            ": ",
            "[",
            "]",
            "{",
            "}",
            "/",
            "^",
            "literally",
            "lit.",
            # XXX check occurrences of ⫽, seems to be used as verb-object
            # separator but shouldn't really be part of the canonical form.
            # See e.g. 打工/Chinese
            "⫽",
            "also expressed with",
            "e.g.",
            "cf.",
            "used ",
            "script needed",
            "please add this translation",
            "usage ",
        ]
    )
)
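
# Illustration: translations like "sana m2" (gender/number code at the
# end), "word [1]" (bracket), or "lit. word" ("lit." anywhere) match
# this pattern and trigger a debug message; a plain "koira" does not.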

# Regular expression to be searched from the translation (with
# re.search) to check if it should be ignored.
tr_ignore_re = re.compile(
    "^("
    + "|".join(re.escape(x) for x in tr_ignore_prefixes)
    + ")|"
    + "|".join(re.escape(x) for x in tr_ignore_contains)
    + "|"
    + "|".join(tr_ignore_regexps)  # these are regexps and are not escaped
)
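
# Illustration: this matches (with re.search) e.g. "Please add this
# translation if you can" (an ignored prefix, anchored with ^),
# "genitive case" anywhere in the text, and "??" at the end of the
# translation (from tr_ignore_regexps).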

# These English texts get converted to tags in translations
english_to_tags = {
    "I have": "first-person singular",
    "you have": "second-person singular",
    "she has": "third-person singular feminine",
    "he has": "third-person singular masculine",
}
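
# Illustration: if a translation ends up with tr["english"] == "I have"
# (typically from a parenthesized gloss), the code below replaces the
# "english" field with the tags ["first-person", "singular"].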


def parse_translation_item_text(
    wxr: WiktextractContext,
    word: str,
    data: WordData,
    item: str,
    sense: Optional[str],
    lang: Optional[str],
    langcode: Optional[str],
    translations_from_template: list[str],
    is_reconstruction: bool,
) -> Optional[str]:
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(data, dict)
    assert isinstance(item, str)
    assert sense is None or isinstance(sense, str)
    assert lang is None or isinstance(lang, str)  # Parent item language
    assert langcode is None or isinstance(langcode, str)  # Template langcode
    assert isinstance(translations_from_template, list)
    for x in translations_from_template:
        assert isinstance(x, str)
    assert is_reconstruction in (True, False)

    # print("parse_translation_item_text: {!r} lang={}"
    #       " langcode={}".format(item, lang, langcode))

    if not item:
        return None

    # Find and remove nested translations from the item
    nested = list(m.group(1) for m in re.finditer(nested_translations_re, item))
    if nested:
        item = re.sub(nested_translations_re, "", item)

    if re.search(r"\(\d+\)|\[\d+\]", item) and "numeral:" not in item:
        wxr.wtp.debug(
            "possible sense number in translation item: {}".format(item),
            sortid="translations/324",
        )

    # Translation items should start with a language name (though some
    # nested translation items don't, relying on the language name from
    # the higher level, and some append a language variant name to a
    # broader language name)
    extra_langcodes = set()
    if lang and name_to_code(lang, "en") != "":
        lang_code = name_to_code(lang, "en")
        extra_langcodes.add(lang_code)
        # Canonicalize language name (we could have gotten it via
        # alias or other_names)
        lang = code_to_name(lang_code, "en")
    m = re.match(r"\*?\s*([-' \w][-'&, \w()]*)[:：]\s*", item)
    tags = []
    if m:
        sublang = m.group(1).strip()
        language_name_variations: list[str] = list()
        lang_sublang = ""  # remains empty unless both lang and sublang are set
        if lang and sublang:
            lang_sublang = lang + " " + sublang
            sublang_lang = sublang + " " + lang
            language_name_variations.extend(
                (
                    lang_sublang,
                    sublang_lang,
                    lang_sublang.replace(" ", "-"),
                    sublang_lang.replace(" ", "-"),
                )
            )
        if " " in sublang:
            language_name_variations.append(sublang.replace(" ", "-"))
        if "-" in sublang:
            language_name_variations.append(sublang.replace("-", " "))

        if lang is None:
            if sublang == "Note":
                return None
            lang = sublang
        elif lang_sublang and any(
            name_to_code(captured_lang := lang_comb, "en") != ""
            # Python 3.8+: capture the value of lang_comb with :=
            for lang_comb in language_name_variations
        ):
            lang = captured_lang
        elif sublang in script_and_dialect_names:
            # If the second-level name is a script name, add it as a
            # tag and keep the top-level language.  This helps with
            # languages that have script names on the same level; those
            # scripts may also be valid language names.
            # See leaf/English/Translations/Pali.
            tags.append(sublang.replace(" ", "-"))
        elif sublang in tr_second_tagmap:
            # Certain second-level names are interpreted as tags
            # (mapped to tags).  Note that these may still have
            # separate language codes, so additional langcode
            # removal tricks may need to be played below.
            tags.extend(tr_second_tagmap[sublang].split())
        elif name_to_code(sublang, "en") != "":
            lang = sublang
        elif sublang[0].isupper() and classify_desc(sublang) == "tags":
            # Interpret it as a tag
            tags.append(sublang)
        else:
            # We don't recognize this prefix
            wxr.wtp.error(
                "unrecognized prefix (language name?) in "
                "translation item: {}".format(item),
                sortid="translations/369",
            )
            return None
        # Strip the language name/tag from the item
        item = item[m.end() :]
    elif lang is None:
        # No matching language prefix.  Check whether the colon is
        # simply missing.
        parts = item.split()
        if len(parts) > 1 and name_to_code(parts[0], "en") != "":
            lang = parts[0]
            item = " ".join(parts[1:])
        else:
            if "__IGNORE__" not in item:
                wxr.wtp.error(
                    "no language name in translation item: {}".format(item),
                    sortid="translations/382",
                )
            return None

    # Map non-standard language names (e.g., "Apache" -> "Apachean")
    lang = tr_langname_map.get(lang, lang)

    # If we didn't get a language code from the template, look it up
    # based on the language name
    if langcode is None and name_to_code(lang, "en") != "":
        langcode = name_to_code(lang, "en")

    # Remove (<langcode>) parts from the item.  They seem to be
    # generated by {{t+|...}}.
    if langcode:
        extra_langcodes.add(langcode)
        if "-" in langcode:
            extra_langcodes.add(langcode.split("-")[0])
        if langcode in (
            "zh",
            "yue",
            "cdo",
            "cmn",
            "dng",
            "hak",
            "mnp",
            "nan",
            "wuu",
            "zh-min-nan",
        ):
            extra_langcodes.update(
                [
                    "zh",
                    "yue",
                    "cdo",
                    "cmn",
                    "dng",
                    "hak",
                    "mnp",
                    "nan",
                    "wuu",
                    "zh-min-nan",
                ]
            )
        elif langcode in ("nn", "nb", "no"):
            extra_langcodes.update(["no", "nn", "nb"])
        for x in extra_langcodes:
            item = re.sub(r"\s*\^?\({}\)".format(re.escape(x)), "", item)
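
        # Illustration: with langcode "ru" the substitution above turns
        # e.g. "слово (ru)" into "слово"; the optional "^" also catches
        # superscript-style markers like "слово^(ru)".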

    # Map translations obtained from templates into magic characters
    # before splitting the translations list.  This way, if a comma
    # (or semicolon etc.) was used inside the template, it won't get
    # split.  We restore the magic characters to the original
    # translations after splitting.  This kludge improves the
    # robustness of collecting translations for phrases whose
    # translations may contain commas.
    translations_from_template = list(
        sorted(translations_from_template, key=lambda x: len(x), reverse=True)
    )
    tr_mappings = {}
    for i, trt in enumerate(translations_from_template):
        if not trt:
            continue
        ch = chr(MAGIC_FIRST + i)
        rex = re.escape(trt)
        if trt[0].isalnum():
            rex = r"\b" + rex
        if trt[-1].isalnum():
            rex = rex + r"\b"
        item = re.sub(rex, ch, item)
        tr_mappings[ch] = trt
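
    # Illustration of the kludge above: if a template produced the
    # translation "tick, tock", it is replaced here by a single magic
    # character so that the comma split below cannot break it apart;
    # the substitution inside the loop below then restores the
    # original text.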

    # There may be multiple translations, separated by commas
    nested.append(item)
    for item in nested:
        tagsets: list[tuple[str, ...]] = []
        # This never does anything; it's never updated, so it's always empty
        # topics: list[str] = []

        for part in split_at_comma_semi(
            item, extra=[" / ", " ／ ", "/", r"\| furthermore: "]  # incl. fullwidth slash
        ):
            # Substitute the magic characters back to the original
            # translations (this is part of dealing with
            # phrasal translations containing commas).
            part = re.sub(
                r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST),
                lambda m: tr_mappings.get(m.group(0), m.group(0)),
                part,
            )

            if part.endswith(":"):  # E.g. "salt of the earth"/Korean
                part = part[:-1].strip()
            if not part:
                continue

            # Strip language links
            tr: TranslationData = {"lang": lang}
            if langcode:
                tr["code"] = langcode
            if tags:
                tr["tags"] = list(tags)
            for ttup in tagsets:
                tr["tags"].extend(ttup)
            # topics is never populated, so it's always empty
            # if topics:
            #     tr["topics"] = list(topics)
            if sense:
                if sense.startswith(
                    (
                        "Translations to be checked",
                        ":The translations below need to be checked",
                    )
                ):
                    continue  # Skip such translations
                else:
                    tr["sense"] = sense

            # Check if this part starts with (tags)
            m = re.match(r"\(([^)]+)\) ", part)
            if m:
                par = m.group(1)
                rest = part[m.end() :]
                cls = classify_desc(par, no_unknown_starts=True)
                if cls == "tags":
                    tagsets2, topics2 = decode_tags(par)
                    for ttup in tagsets2:
                        data_extend(tr, "tags", ttup)
                    data_extend(tr, "topics", topics2)
                    part = rest

            # Check if this part ends with (tags).  Note that
            # tr_note_re will mess things up if we rely on this being
            # checked later.
            m = re.search(r" +\(([^)]+)\)$", part)
            if m:
                par = m.group(1)
                rest = part[: m.start()]
                cls = classify_desc(par, no_unknown_starts=True)
                if cls == "tags":
                    tagsets2, topics2 = decode_tags(par)
                    for ttup in tagsets2:
                        data_extend(tr, "tags", ttup)
                    data_extend(tr, "topics", topics2)
                    part = rest

            # Check if this part starts with "<tags/english>: <rest>"
            m = re.match(r"([-\w() ]+): ", part)
            if m:
                par = m.group(1).strip()
                rest = part[m.end() :]
                if par in ("", "see"):
                    part = rest
                else:
                    cls = classify_desc(par)
                    # print("par={!r} cls={!r}".format(par, cls))
                    if cls == "tags":
                        tagsets2, topics2 = decode_tags(par)
                        for ttup in tagsets2:
                            data_extend(tr, "tags", ttup)
                        data_extend(tr, "topics", topics2)
                        part = rest
                    elif cls == "english":
                        if re.search(tr_note_re, par):
                            if "note" in tr:
                                tr["note"] += "; " + par
                            else:
                                tr["note"] = par
                        else:
                            if "english" in tr:
                                tr["english"] += "; " + par
                            else:
                                tr["english"] = par
                        part = rest

            # Skip translations that our template_fn says to ignore
            # and those that contain Lua execution errors.
            if "__IGNORE__" in part:
                continue  # Contains something we want to ignore
            if part.startswith("Lua execution error"):
                continue

            # Handle certain suffixes in translations that
            # we might put in "note" but that we can actually
            # parse into tags.
            for suffix, t in (
                (" with dative", "with-dative"),
                (" with genitive", "with-genitive"),
                (" with accusative", "with-accusative"),
                (" in subjunctive", "with-subjunctive"),
                (" and conditional mood", "with-conditional"),
                (" - I have - you have", "first-person second-person singular"),
                (" - I have", "first-person singular"),
                (" - you have", "second-person singular"),
            ):
                if part.endswith(suffix):
                    part = part[: -len(suffix)]
                    data_append(tr, "tags", t)
                    break
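
            # Illustration: a part like "haben with dative" becomes the
            # translation "haben" with the tag "with-dative"; at most
            # one suffix is stripped per part (note the break above).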

            # Handle certain prefixes in translations
            for prefix, t in (("subjunctive of ", "with-subjunctive"),):
                if part.startswith(prefix):
                    part = part[len(prefix) :]
                    data_append(tr, "tags", t)
                    break

            # Skip certain one-character translations entirely
            # (these could result from templates being ignored)
            if part in ",;.":
                continue

            if "english" in tr and tr["english"] in english_to_tags:
                data_extend(tr, "tags", english_to_tags[tr["english"]].split())
                del tr["english"]

            # Certain values indicate it is not actually a translation.
            # See the definition of tr_ignore_re to adjust.
            m = re.search(tr_ignore_re, part)
            w: Optional[str] = None

            if m and (
                m.start() != 0 or m.end() != len(part) or len(part.split()) > 1
            ):
                # This translation will be skipped because it
                # seems to be some kind of explanatory text.
                # However, let's put it in the "note" field
                # instead, unless it is one of the listed fully
                # ignored ones.
                if part in ("please add this translation if you can",):
                    continue
                # Save in note field
                tr["note"] = part
            else:
                # Interpret it as an actual translation
                parse_translation_desc(wxr, lang, part, tr)
                w = tr.get("word")
                if not w:
                    continue  # Not set or empty
                if w.startswith(("*", ":")):
                    w = w[1:].strip()
                if w in ("[Term?]", ":", "/", "?"):
                    continue  # These are not valid linkage targets
                if len(w) > 3 * len(word) + 20:
                    # Accept the translation if the word looks like an
                    # acronym: str.isupper() returns True for both
                    # 'ISBN' and 'I.S.B.N.', and false positives are
                    # unlikely.
                    if not word.isupper():
                        # Likely descriptive text or an example,
                        # because it is much too long.
                        wxr.wtp.debug(
                            "Translation too long compared to word, so"
                            " it is skipped",
                            sortid="translations/609-20230504",
                        )
                        del tr["word"]
                        tr["note"] = w

            # Sanity check: try to detect certain suspicious
            # patterns in translations
            if "word" in tr:
                m = re.search(tr_suspicious_re, tr["word"])
                if m and lang not in (
                    "Bats",  # ^ in tree/English/Tr/Bats
                ):
                    wxr.wtp.debug(
                        "suspicious translation with {!r}: {}".format(
                            m.group(0), tr
                        ),
                        sortid="translations/611",
                    )

            if "tags" in tr:
                tr["tags"] = list(sorted(set(tr["tags"])))

            # If we have only notes, add as-is
            if "word" not in tr:
                data_append(data, "translations", tr)
                continue

            # Split the word into alternatives if it contains no spaces
            if w:
                alts = [w]
                if " " not in w:
                    # If there are no spaces, split on slashes (ASCII
                    # and fullwidth)
                    alts = re.split(r"/|／", w)
                # Note: there could be remaining slashes, but they are
                # sometimes used in ways we cannot resolve
                # programmatically.
                # Create translations for each alternative.
                for alt in alts:
                    alt = alt.strip()
                    tr1 = copy.deepcopy(tr)
                    if alt.startswith("*") or alt.startswith(":"):
                        alt = alt[1:].strip()
                    if not alt:
                        continue
                    tr1["word"] = alt
                    data_append(data, "translations", tr1)

    # Return the language name, in case we have subitems
    return lang
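

# A minimal usage sketch (hypothetical item text; the wxr context and
# the word-data dict come from the caller, typically the page parser):
#
#     lang = parse_translation_item_text(
#         wxr, "dog", data, "Finnish: koira", None, None, None,
#         ["koira"], False,
#     )
#     # data["translations"] now includes {"lang": "Finnish",
#     # "code": "fi", "word": "koira"}, and the returned language name
#     # ("Finnish") can be passed back in for nested subitems.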