Coverage for src/wiktextract/extractor/en/translations.py: 88% (236 statements)


# Code related to parsing translations
#
# Copyright (c) 2019-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org

import copy
import re
from typing import Optional

from mediawiki_langcodes import code_to_name, name_to_code
from wikitextprocessor import MAGIC_FIRST, MAGIC_LAST

from ...datautils import data_append, data_extend, split_at_comma_semi
from ...wxr_context import WiktextractContext
from .form_descriptions import (
    classify_desc,
    decode_tags,
    nested_translations_re,
    parse_translation_desc,
    tr_note_re,
)
from .type_utils import TranslationData, WordData

# Maps language names in translations to actual language names.
# E.g., "Apache" is not a language name, but "Apachean" is.
tr_langname_map = {
    "Apache": "Apachean",
    "Lutshootseed": "Lushootseed",
    "Old Assamese": "Early Assamese",
}

# These names will be interpreted as script names or dialect names
# when used as a second-level name in translations. Some script names
# are also valid language names, but it looks like the ones that are
# also script names aren't used on the second level as language names.
# These will not be interpreted as a separate language, but will instead
# be included under the parent language with the script/dialect as a tag
# (with spaces replaced by hyphens).
script_and_dialect_names = set(
    [
        # Scripts
        "ALUPEC",
        "Adlam",
        "Arabic",  # Script for Kashmiri
        "Bengali",
        "Burmese",
        "Carakan",
        "CJKV Characters",
        "Cyrillic",
        "Devanagari",
        "Glagolitic",
        "Gurmukhi",
        "Hebrew",  # For Aramaic
        "Jawi",
        "Khmer",
        "Latin",
        "Mongolian",
        "Roman",
        "Shahmukhi",
        "Sinhalese",
        "Syriac",  # For Aramaic
        "Classical Syriac",  # For Aramaic
        "Taraškievica",
        "Thai",
        "Uyghurjin",
        # Chinese dialects/languages
        "Cantonese",  # Variant of Chinese
        "Dungan",  # Chinese
        "Gan",  # Chinese
        "Hakka",  # Chinese
        "Hokkien",  # Chinese
        "Jin",  # Chinese
        "Mandarin",  # Chinese
        "Min Bei",  # Chinese
        "Min Dong",  # Chinese
        "Min Nan",  # Chinese
        "Wu",  # Chinese
        "Xiang",  # Chinese
        "Jianghuai Mandarin",  # Chinese
        "Jilu Mandarin",  # Chinese
        "Jin Mandarin",  # Chinese
        "Northern Mandarin",  # Chinese
        "Southwestern Mandarin",  # Chinese
        "Taiwanese Mandarin",  # Chinese
        "Coastal Min",  # Chinese
        "Inland Min",  # Chinese
        "Leizhou Min",  # Chinese
        "Min",  # Chinese
        "Puxian Min",  # Chinese
        "Shanghainese Wu",  # Chinese
        "Wenzhou Wu",  # Chinese
        "Wenzhou",  # Chinese
        "Hsinchu Hokkien",  # Chinese
        "Jinjiang Hokkien",  # Chinese
        "Kaohsiung Hokkien",  # Chinese
        "Pinghua",  # Chinese
        "Eastern Punjabi",
        "Western Punjabi",
        # Various countries/regions
        "Alsace",
        "Bavaria",
        "Belgium",
        "Canada",
        "Central",
        "Cologne",
        "Fogo",
        "Föhr",
        "Föhr-Amrum",
        "Hallig",
        "Helgoland",
        "Heligoland",
        "Santiago",
        "Sylt",
        "Mooring",
        "Vancouver Island",
        "Wiedingharde",
        "Anpezan",  # Variant of Ladin
        "Badiot",  # Ladin
        "Fascian",  # Ladin
        "Fodom",  # Ladin
        "Gherdëina",  # Ladin
        "Anbarani",  # Variant of Talysh
        "Asalemi",  # Variant of Talysh
        "Alemannic German",  # Variant of German
        "Rhine Franconian",  # Variant of German
        "German Low German",  # Variant of Low German
        "Campidanese",  # Variant of Sardinian
        "Logudorese",  # Variant of Sardinian
        "Digor",  # Variant of Ossetian
        "Iron",  # Variant of Ossetian
        "Northern Puebla",  # Variant of Nahuatl
        "Mecayapan",  # Variant of Nahuatl
        "Egyptian Arabic",  # Variant of Arabic
        "Gulf Arabic",  # Variant of Arabic
        "Hijazi Arabic",  # Variant of Arabic
        "Moroccan Arabic",  # Variant of Arabic
        "North Levantine Arabic",  # Variant of Arabic
        "South Levantine Arabic",  # Variant of Arabic
        "Alviri",  # Variant of Alviri-Vidari
        "Vidari",  # Variant of Alviri-Vidari
        "Tashelhit",  # Variant of Berber
        "Bokmål",  # Variant of Norwegian
        "Nynorsk",  # Variant of Norwegian
        "Mycenaean",  # Variant of Greek
        # Language varieties
        "Ancient",
        "Classical",
        "Draweno-Polabian",
        "Literary",
        "Lower",
        "Manitoba Saulteux",
        "Modern",
        "Modern Polabian",
        "Modified traditional",
        "Northern",
        "Northern and Southern",
        "Old Polabian",
        "Simplified",
        "Southern",
        "Traditional",
        "Western",
        "1708",
        "1918",
    ]
)
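
# For example, under a top-level "Kashmiri" item, a second-level "Arabic:"
# item is treated as the Arabic script rather than as the Arabic language:
# the translation stays under "Kashmiri" with the tag "Arabic" added.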

# These names should be interpreted as tags in second-level
# translations (the value lists the tags, space-separated).
tr_second_tagmap = {
    "Föhr-Amrum, Bökingharde": "Föhr-Amrum Bökingharde",
    "Halligen, Goesharde, Karrhard": "Halligen Goesharde Karrhard",
    "Föhr-Amrum and Sylt dialect": "Föhr-Amrum Sylt",
    "Hallig and Mooring": "Hallig Mooring",
    "Föhr-Amrum & Mooring": "Föhr-Amrum Mooring",
}

# Ignore translations that start with one of these
tr_ignore_prefixes = [
    "+",
    "Different structure used",
    "Literally",
    "No equivalent",
    "Not used",
    "Please add this translation if you can",
    "See: ",
    "Use ",
    "[Book Pahlavi needed]",
    "[book pahlavi needed]",
    "[script needed]",
    "different structure used",
    "e.g.",
    "lit.",
    "literally",
    "no equivalent",
    "normally ",
    "not used",
    "noun compound ",
    "please add this translation if you can",
    "prefix ",
    "see: ",
    "suffix ",
    "use ",
    "usually ",
]

# Ignore translations that contain one of these anywhere (case-sensitive).
# Or actually, put such translations in the "note" field rather than in "word".
tr_ignore_contains = [
    "usually expressed with ",
    " can be used ",
    " construction used",
    " used with ",
    " + ",
    "genitive case",
    "dative case",
    "nominative case",
    "accusative case",
    "absolute state",
    "infinitive of ",
    "participle of ",
    "for this sense",
    "depending on the circumstances",
    "expressed with ",
    " expression ",
    " means ",
    " is used",
    " — ",  # Used to give example sentences
    " translation",
    "not attested",
    "grammatical structure",
    "construction is used",
    "tense used",
    " lit.",
    " literally",
    "dative",
    "accusative",
    "genitive",
    "essive",
    "partitive",
    "translative",
    "elative",
    "inessive",
    "illative",
    "adessive",
    "ablative",
    "allative",
    "abessive",
    "comitative",
    "instructive",
    "particle",
    "predicative",
    "attributive",
    "preposition",
    "postposition",
    "prepositional",
    "postpositional",
    "prefix",
    "suffix",
    "translated",
]

# Ignore translations that match one of these regular expressions
tr_ignore_regexps = [
    r"^\[[\d,]+\]$",
    r"\?\?$",
    r"^\s*$",
]

# If a translation matches this regexp (with re.search), we print a debug
# message
tr_suspicious_re = re.compile(
    r" [mf][12345]$|"
    + r" [mfnc]$|"
    + r" (pf|impf|vir|nvir|anml|anim|inan|sg|pl)$|"
    + "|".join(
        re.escape(x)
        for x in [
            "; ",
            "* ",
            ": ",
            "[",
            "]",
            "{",
            "}",
            "/",
            "^",
            "literally",
            "lit.",
            # XXX check occurrences of ⫽, seems to be used as verb-object
            # separator but shouldn't really be part of the canonical form.
            # See e.g. 打工/Chinese
            "⫽",
            "also expressed with",
            "e.g.",
            "cf.",
            "used ",
            "script needed",
            "please add this translation",
            "usage ",
        ]
    )
)
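
# For example, a translation ending in " m1" or containing "[", "lit.", or
# "⫽" would be reported as suspicious by this pattern.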

# Regular expression searched for in a translation (with re.search) to
# check if it should be ignored.
tr_ignore_re = re.compile(
    "^("
    + "|".join(re.escape(x) for x in tr_ignore_prefixes)
    + ")|"
    + "|".join(re.escape(x) for x in tr_ignore_contains)
    + "|"
    + "|".join(tr_ignore_regexps)
)  # These are not to be escaped
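
# For example, an item starting with "No equivalent" matches via
# tr_ignore_prefixes, and one containing "genitive case" matches via
# tr_ignore_contains; such items end up in the "note" field rather than
# being parsed as translation words.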

# These English texts get converted to tags in translations
english_to_tags = {
    "I have": "first-person singular",
    "you have": "second-person singular",
    "she has": "third-person singular feminine",
    "he has": "third-person singular masculine",
}


def parse_translation_item_text(
    wxr: WiktextractContext,
    word: str,
    data: WordData,
    item: str,
    sense: Optional[str],
    lang: Optional[str],
    langcode: Optional[str],
    translations_from_template: list[str],
    is_reconstruction: bool,
) -> Optional[str]:
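    """Parse a single translation list item for ``word``.

    ``lang`` is the language name inherited from a parent list item (if
    any), and ``langcode`` is the language code captured from a
    translation template. Parsed translations are appended to ``data``
    under "translations". Returns the resolved language name so that
    nested subitems can inherit it, or None if the item was ignored.
    """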

    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(data, dict)
    assert isinstance(item, str)
    assert sense is None or isinstance(sense, str)
    assert lang is None or isinstance(lang, str)  # Parent item language
    assert langcode is None or isinstance(langcode, str)  # Template langcode
    assert isinstance(translations_from_template, list)
    for x in translations_from_template:
        assert isinstance(x, str)
    assert is_reconstruction in (True, False)

    # print("parse_translation_item_text: {!r} lang={}"
    #       " langcode={}".format(item, lang, langcode))

    if not item:
        return None

    # Find and remove nested translations from the item
    nested = list(m.group(1) for m in re.finditer(nested_translations_re, item))
    if nested:
        item = re.sub(nested_translations_re, "", item)

    if re.search(r"\(\d+\)|\[\d+\]", item) and "numeral:" not in item:
        wxr.wtp.debug(
            "possible sense number in translation item: {}".format(item),
            sortid="translations/324",
        )

    # Translation items should start with a language name (except
    # some nested translation items don't and rely on the language
    # name from the higher level, and some append a language variant
    # name to a broader language name)
    extra_langcodes = set()
    if lang and name_to_code(lang, "en") != "":
        lang_code = name_to_code(lang, "en")
        extra_langcodes.add(lang_code)
        # Canonicalize the language name (we could have gotten it via
        # an alias or other_names)
        lang = code_to_name(lang_code, "en")
    m = re.match(r"\*?\s*([-' \w][-'&, \w()]*)[:：]\s*", item)
    tags = []
    if m:
        sublang = m.group(1).strip()
        language_name_variations: list[str] = list()
        if lang and sublang:
            lang_sublang = lang + " " + sublang
            sublang_lang = sublang + " " + lang
            language_name_variations.extend(
                (
                    lang_sublang,
                    sublang_lang,
                    lang_sublang.replace(" ", "-"),
                    sublang_lang.replace(" ", "-"),
                )
            )
        if " " in sublang:
            language_name_variations.append(sublang.replace(" ", "-"))
        if "-" in sublang:
            language_name_variations.append(sublang.replace("-", " "))
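
        # For example, lang="Norwegian" and sublang="Bokmål" yield the
        # candidate names "Norwegian Bokmål", "Bokmål Norwegian",
        # "Norwegian-Bokmål", and "Bokmål-Norwegian", which are tried
        # against the language-name database below.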

        if lang is None:
            if sublang == "Note":
                return None
            lang = sublang
        elif lang_sublang and any(
            name_to_code(captured_lang := lang_comb, "en") != ""
            # Python 3.8: catch the value of lang_comb with :=
            for lang_comb in language_name_variations
        ):
            lang = captured_lang
        elif sublang in script_and_dialect_names:
            # If the second-level name is a script name, add it as a
            # tag and keep the top-level language.
            # This helps with languages that have script names on
            # the same level; those scripts may also be valid
            # language names. See leaf/English/Translations/Pali.
            tags.append(sublang.replace(" ", "-"))
        elif sublang in tr_second_tagmap:
            # Certain second-level names are interpreted as tags
            # (mapped to tags). Note that these may still have
            # separate language codes, so additional langcode
            # removal tricks may need to be played below.
            tags.extend(tr_second_tagmap[sublang].split())
        elif name_to_code(sublang, "en") != "":
            lang = sublang
        elif sublang[0].isupper() and classify_desc(sublang) == "tags":
            # Interpret it as a tag
            tags.append(sublang)
        else:
            # We don't recognize this prefix
            wxr.wtp.error(
                "unrecognized prefix (language name?) in "
                "translation item: {}".format(item),
                sortid="translations/369",
            )
            return None
        # Strip the language name/tag from the item
        item = item[m.end() :]
    elif lang is None:
        # No matching language prefix. Check whether the colon is
        # simply missing and the item starts with a language name.
        parts = item.split()
        if len(parts) > 1 and name_to_code(parts[0], "en") != "":
            lang = parts[0]
            item = " ".join(parts[1:])
        else:
            if "__IGNORE__" not in item:
                wxr.wtp.error(
                    "no language name in translation item: {}".format(item),
                    sortid="translations/382",
                )
            return None

    # Map non-standard language names (e.g., "Apache" -> "Apachean")
    lang = tr_langname_map.get(lang, lang)

    # If we didn't get a language code from the template, look it up
    # based on the language name
    if langcode is None and name_to_code(lang, "en") != "":
        langcode = name_to_code(lang, "en")

    # Remove (<langcode>) parts from the item. They seem to be
    # generated by {{t+|...}}.
    if langcode:
        extra_langcodes.add(langcode)
        if "-" in langcode:
            extra_langcodes.add(langcode.split("-")[0])
        if langcode in (
            "zh",
            "yue",
            "cdo",
            "cmn",
            "dng",
            "hak",
            "mnp",
            "nan",
            "wuu",
            "zh-min-nan",
        ):
            extra_langcodes.update(
                [
                    "zh",
                    "yue",
                    "cdo",
                    "cmn",
                    "dng",
                    "hak",
                    "mnp",
                    "nan",
                    "wuu",
                    "zh-min-nan",
                ]
            )
        elif langcode in ("nn", "nb", "no"):
            extra_langcodes.update(["no", "nn", "nb"])
    for x in extra_langcodes:
        item = re.sub(r"\s*\^?\({}\)".format(re.escape(x)), "", item)
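
    # For example, with langcode "fi", a trailing "(fi)" generated by
    # {{t+|fi|...}} is stripped: "talo (fi)" becomes "talo". For Chinese
    # lects, all of the related codes ("zh", "cmn", "yue", etc.) are
    # stripped the same way.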

    # Map translations obtained from templates into magic characters
    # before splitting the translations list. This way, if a comma
    # (or semicolon etc.) was used inside the template, it won't get
    # split. We restore the magic characters into the original
    # translations after splitting. This kludge improves the
    # robustness of collecting translations for phrases whose
    # translations may contain commas.
    translations_from_template = list(
        sorted(translations_from_template, key=lambda x: len(x), reverse=True)
    )
    tr_mappings = {}
    for i, trt in enumerate(translations_from_template):
        if not trt:
            continue
        ch = chr(MAGIC_FIRST + i)
        rex = re.escape(trt)
        if trt[0].isalnum():
            rex = r"\b" + rex
        if trt[-1].isalnum():
            rex = rex + r"\b"
        item = re.sub(rex, ch, item)
        tr_mappings[ch] = trt
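
    # For example, if a template produced a phrase translation that itself
    # contains a comma, the whole phrase is replaced by one magic character
    # here, so split_at_comma_semi below cannot break it apart; the
    # tr_mappings table restores the original text after the split.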

    # There may be multiple translations, separated by comma
    nested.append(item)
    for item in nested:
        tagsets: list[tuple[str, ...]] = []
        # This never does anything; it's never updated, so it's always empty
        # topics: list[str] = []

        for part in split_at_comma_semi(
            item, extra=[" / ", " ／ ", "/", r"\| furthermore: "]
        ):
            # Substitute the magic characters back to the original
            # translations (this is part of dealing with
            # phrasal translations containing commas).
            part = re.sub(
                r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST),
                lambda m: tr_mappings.get(m.group(0), m.group(0)),
                part,
            )

            if part.endswith(":"):  # E.g. "salt of the earth"/Korean
                part = part[:-1].strip()
            if not part:
                continue

            # Strip language links
            tr: TranslationData = {"lang": lang}
            if langcode:
                tr["code"] = langcode  # DEPRECATED in favor of "lang_code"
                tr["lang_code"] = langcode
            if tags:
                tr["tags"] = list(tags)
                for ttup in tagsets:
                    tr["tags"].extend(ttup)
            # topics is never populated, so it's always empty
            # if topics:
            #     tr["topics"] = list(topics)
            if sense:
                if sense.startswith(
                    (
                        "Translations to be checked",
                        ":The translations below need to be checked",
                    )
                ):
                    continue  # Skip such translations
                else:
                    tr["sense"] = sense

            # Check if this part starts with (tags)
            m = re.match(r"\(([^)]+)\) ", part)
            if m:
                par = m.group(1)
                rest = part[m.end() :]
                cls = classify_desc(par, no_unknown_starts=True)
                if cls == "tags":
                    tagsets2, topics2 = decode_tags(par)
                    for ttup in tagsets2:
                        data_extend(tr, "tags", ttup)
                    data_extend(tr, "topics", topics2)
                    part = rest

            # Check if this part ends with (tags). Note that
            # note-re will mess things up if we rely on this being
            # checked later.
            m = re.search(r" +\(([^)]+)\)$", part)
            if m:
                par = m.group(1)
                rest = part[: m.start()]
                cls = classify_desc(par, no_unknown_starts=True)
                if cls == "tags":
                    tagsets2, topics2 = decode_tags(par)
                    for ttup in tagsets2:
                        data_extend(tr, "tags", ttup)
                    data_extend(tr, "topics", topics2)
                    part = rest
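
            # For example, in a part like "(informal) foo", the
            # parenthesized "informal" would be decoded into tags
            # (assuming classify_desc recognizes it as one) and part
            # reduced to "foo"; a trailing " (informal)" is handled
            # the same way.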

            # Check if this part starts with "<tags/english>: <rest>"
            m = re.match(r"([-\w() ]+): ", part)
            if m:
                par = m.group(1).strip()
                rest = part[m.end() :]
                if par in ("", "see"):
                    part = rest
                else:
                    cls = classify_desc(par)
                    # print("par={!r} cls={!r}".format(par, cls))
                    if cls == "tags":
                        tagsets2, topics2 = decode_tags(par)
                        for ttup in tagsets2:
                            data_extend(tr, "tags", ttup)
                        data_extend(tr, "topics", topics2)
                        part = rest
                    elif cls == "english":
                        if re.search(tr_note_re, par):
                            if "note" in tr:
                                tr["note"] += "; " + par
                            else:
                                tr["note"] = par
                        else:
                            if "translation" in tr and "english" in tr:
                                # DEPRECATED for "translation"
                                tr["english"] += "; " + par
                                tr["translation"] += "; " + par
                            else:
                                # DEPRECATED for "translation"
                                tr["english"] = par
                                tr["translation"] = par
                        part = rest

            # Skip translations that our template_fn says to ignore
            # and those that contain Lua execution errors.
            if "__IGNORE__" in part:
                continue  # Contains something we want to ignore
            if part.startswith("Lua execution error"):
                continue

            # Handle certain suffixes in translations that
            # we might put in "note" but that we can actually
            # parse into tags.
            for suffix, t in (
                (" with dative", "with-dative"),
                (" with genitive", "with-genitive"),
                (" with accusative", "with-accusative"),
                (" in subjunctive", "with-subjunctive"),
                (" and conditional mood", "with-conditional"),
                (" - I have - you have", "first-person second-person singular"),
                (" - I have", "first-person singular"),
                (" - you have", "second-person singular"),
            ):
                if part.endswith(suffix):
                    part = part[: -len(suffix)]
                    data_append(tr, "tags", t)
                    break

            # Handle certain prefixes in translations
            for prefix, t in (("subjunctive of ", "with-subjunctive"),):
                if part.startswith(prefix):
                    part = part[len(prefix) :]
                    data_append(tr, "tags", t)
                    break

            # Skip certain one-character translations entirely
            # (these could result from templates being ignored)
            if part in ",;.":
                continue

            if "english" in tr and tr["english"] in english_to_tags:
                data_extend(tr, "tags", english_to_tags[tr["english"]].split())
                del tr["english"]  # DEPRECATED for "translation"
                if "translation" in tr:
                    del tr["translation"]

            # Certain values indicate it is not actually a translation.
            # See the definition of tr_ignore_re to adjust.
            m = re.search(tr_ignore_re, part)
            w: Optional[str] = None

            if m and (
                m.start() != 0 or m.end() != len(part) or len(part.split()) > 1
            ):
                # This translation will be skipped because it
                # seems to be some kind of explanatory text.
                # However, let's put it in the "note" field
                # instead, unless it is one of the listed fully
                # ignored ones.
                if part in ("please add this translation if you can",):
                    continue
                # Save in note field
                tr["note"] = part
            else:
                # Interpret it as an actual translation
                parse_translation_desc(wxr, lang, part, tr)
                w = tr.get("word")
                if not w:
                    continue  # Not set or empty
                if w.startswith(("*", ":")):
                    w = w[1:].strip()
                if w in ("[Term?]", ":", "/", "?"):
                    continue  # These are not valid linkage targets
                if len(w) > 3 * len(word) + 20:
                    # Accept the translation if the word looks like an
                    # acronym: both 'ISBN'.isupper() and
                    # 'I.S.B.N'.isupper() return True, and false
                    # positives are unlikely.
                    if not word.isupper():
                        # Likely descriptive text or an example,
                        # because it is much too long.
                        wxr.wtp.debug(
                            "Translation too long compared to word, so"
                            " it is skipped",
                            sortid="translations/609-20230504",
                        )
                        del tr["word"]
                        tr["note"] = w

709 # Sanity check: try to detect certain suspicious 

710 # patterns in translations 

711 if "word" in tr: 

712 m = re.search(tr_suspicious_re, tr["word"]) 

713 if m and lang not in ( 

714 "Bats", # ^ in tree/English/Tr/Bats 

715 ): 

716 wxr.wtp.debug( 

717 "suspicious translation with {!r}: {}".format( 

718 m.group(0), tr 

719 ), 

720 sortid="translations/611", 

721 ) 

722 

723 if "tags" in tr: 

724 tr["tags"] = list(sorted(set(tr["tags"]))) 

725 

726 # If we have only notes, add as-is 

727 if "word" not in tr: 

728 data_append(data, "translations", tr) 

729 continue 

730 

731 # Split if it contains no spaces 

732 if w: 732 ↛ 523line 732 didn't jump to line 523 because the condition on line 732 was always true

733 alts = [w] 

734 if " " not in w: 

735 # If no spaces, split by separator 

736 alts = re.split(r"/|/", w) 

737 # Note: there could be remaining slashes, but they are 

738 # sometimes used in ways we cannot resolve programmatically. 

739 # Create translations for each alternative. 

740 for alt in alts: 

741 alt = alt.strip() 

742 tr1 = copy.deepcopy(tr) 

743 if alt.startswith("*") or alt.startswith(":"): 

744 alt = alt[1:].strip() 

745 if not alt: 745 ↛ 746line 745 didn't jump to line 746 because the condition on line 745 was never true

746 continue 

747 tr1["word"] = alt 

748 data_append(data, "translations", tr1) 

749 

750 # Return the language name, in case we have subitems 

751 return lang