Coverage for src / wiktextract / extractor / en / translations.py: 88%

238 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-17 07:22 +0000

1# Code related to parsing translations 

2# 

3# Copyright (c) 2019-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import copy 

6import re 

7from typing import Optional 

8 

9from mediawiki_langcodes import code_to_name, name_to_code 

10from wikitextprocessor import MAGIC_FIRST, MAGIC_LAST 

11 

12from ...datautils import data_append, data_extend, split_at_comma_semi 

13from ...wxr_context import WiktextractContext 

14from .form_descriptions import ( 

15 classify_desc, 

16 decode_tags, 

17 nested_translations_re, 

18 parse_translation_desc, 

19 tr_note_re, 

20) 

21from .type_utils import TranslationData, WordData 

22 

# Maps language names in translations to actual language names.
# E.g., "Apache" is not a language name, but "Apachean" is.
# Keys are names as they appear in Wiktionary translation lists;
# values are the canonical names used in their place.
tr_langname_map = {
    "Apache": "Apachean",
    "Lutshootseed": "Lushootseed",  # typo in Wiktionary source data
    "Old Assamese": "Early Assamese",
}

30 

# These names will be interpreted as script names or dialect names
# when used as a second-level name in translations.  Some script names
# are also valid language names, but it looks like the ones that are
# also script names aren't used on the second level as language names.
# These will not be interpreted as a separate language, but will instead
# be included under the parent language with the script/dialect as a tag
# (with spaces replaced by hyphens).
script_and_dialect_names = {
    # Scripts
    "ALUPEC",
    "Adlam",
    "Arabic",  # Script for Kashmiri
    "Bengali",
    "Burmese",
    "Carakan",
    "CJKV Characters",
    "Cyrillic",
    "Devanagari",
    "Glagolitic",
    "Gurmukhi",
    "Hebrew",  # For Aramaic
    "Jawi",
    "Khmer",
    "Latin",
    "Mongolian",
    "Roman",
    "Shahmukhi",
    "Sinhalese",
    "Syriac",  # For Aramaic
    "Classical Syriac",  # For Aramaic
    "Taraškievica",
    "Thai",
    "Uyghurjin",
    # Chinese dialects/languages
    "Cantonese",  # Variant of Chinese
    "Dungan",  # Chinese
    "Gan",  # Chinese
    "Hakka",  # Chinese
    "Hokkien",  # Chinese
    "Jin",  # Chinese
    "Mandarin",  # Chinese
    "Min Bei",  # Chinese
    "Min Dong",  # Chinese
    "Min Nan",  # Chinese
    "Wu",  # Chinese
    "Xiang",  # Chinese
    "Jianghuai Mandarin",  # Chinese
    "Jilu Mandarin",  # Chinese
    "Jin Mandarin",  # Chinese
    "Northern Mandarin",  # Chinese
    "Southwestern Mandarin",  # Chinese
    "Taiwanese Mandarin",  # Chinese
    "Coastal Min",  # Chinese
    "Inland Min",  # Chinese
    "Leizhou Min",  # Chinese
    "Min",  # Chinese
    "Puxian Min",  # Chinese
    "Shanghainese Wu",  # Chinese
    "Wenzhou Wu",  # Chinese
    "Wenzhou",  # Chinese
    "Hsinchu Hokkien",  # Chinese
    "Jinjiang Hokkien",  # Chinese
    "Kaohsiung Hokkien",  # Chinese
    "Pinghua",  # Chinese
    "Eastern Punjabi",
    "Western Punjabi",
    # Various countries/regions
    "Alsace",
    "Bavaria",
    "Belgium",
    "Canada",
    "Central",
    "Cologne",
    "Fogo",
    "Föhr",
    "Föhr-Amrum",
    "Hallig",
    "Helgoland",
    "Heligoland",
    "Santiago",
    "Sylt",
    "Mooring",
    "Vancouver Island",
    "Wiedingharde",
    "Anpezan",  # Variant of Ladin
    "Badiot",  # Ladin
    "Fascian",  # Ladin
    "Fodom",  # Ladin
    "Gherdëina",  # Ladin
    "Anbarani",  # Variant of Talysh
    "Asalemi",  # Variant of Talysh
    "Alemannic German",  # Variant of German
    "Rhine Franconian",  # Variant of German
    "German Low German",  # Variant of Low German
    "Campidanese",  # Variant of Sardinian
    "Logudorese",  # Variant of Sardinian
    "Digor",  # Variant of Ossetian
    "Iron",  # Variant of Ossetian
    "Northern Puebla",  # Variant of Nahuatl
    "Mecayapan",  # Variant of Nahuatl
    "Egyptian Arabic",  # Variant of Arabic
    "Gulf Arabic",  # Variant of Arabic
    "Hijazi Arabic",  # Variant of Arabic
    "Moroccan Arabic",  # Variant of Arabic
    "North Levantine Arabic",  # Variant of Arabic
    "South Levantine Arabic",  # Variant of Arabic
    "Alviri",  # Variant of Alviri-Vidari
    "Vidari",  # Variant of Alviri-Vidari
    "Tashelhit",  # Variant of Berber
    "Bokmål",  # Variant of Norwegian
    "Nynorsk",  # Variant of Norwegian
    "Mycenaean",  # Variant of Greek
    # Language varieties
    "Ancient",
    "Classical",
    "Draweno-Polabian",
    "Literary",
    "Lower",
    "Manitoba Saulteux",
    "Modern",
    "Modern Polabian",
    "Modified traditional",
    "Northern",
    "Northern and Southern",
    "Old Polabian",
    "Simplified",
    "Southern",
    "Traditional",
    "Western",
    "1708",
    "1918",
}

166 

# These names should be interpreted as tags (as listed in the value
# space-separated) in second-level translations.
# Keys are second-level "language" names seen in translation lists;
# values are space-separated tag strings to attach instead.
tr_second_tagmap = {
    "Föhr-Amrum, Bökingharde": "Föhr-Amrum Bökingharde",
    "Halligen, Goesharde, Karrhard": "Halligen Goesharde Karrhard",
    "Föhr-Amrum and Sylt dialect": "Föhr-Amrum Sylt",
    "Hallig and Mooring": "Hallig Mooring",
    "Föhr-Amrum & Mooring": "Föhr-Amrum Mooring",
}

176 

# Ignore translations that start with one of these (case-sensitive;
# matched as alternatives anchored at the start inside tr_ignore_re below).
tr_ignore_prefixes = [
    "+",
    "Different structure used",
    "Literally",
    "No equivalent",
    "Not used",
    "Please add this translation if you can",
    "See: ",
    "Use ",
    "[Book Pahlavi needed]",
    "[book pahlavi needed]",
    "[script needed]",
    "different structure used",
    "e.g.",
    "lit.",
    "literally",
    "no equivalent",
    "normally ",
    "not used",
    "noun compound ",
    "please add this translation if you can",
    "prefix ",
    "see: ",
    "suffix ",
    "use ",
    "usually ",
]

205 

# Ignore translations that contain one of these anywhere (case-sensitive).
# Or actually, put such translations in the "note" field rather than in "word".
# These indicate explanatory text rather than an actual translation term.
tr_ignore_contains = [
    "usually expressed with ",
    " can be used ",
    " construction used",
    " used with ",
    " + ",
    "genitive case",
    "dative case",
    "nominative case",
    "accusative case",
    "absolute state",
    "infinitive of ",
    "participle of ",
    "for this sense",
    "depending on the circumstances",
    "expressed with ",
    " expression ",
    " means ",
    " is used",
    " — ",  # Used to give example sentences
    " translation",
    "not attested",
    "grammatical structure",
    "construction is used",
    "tense used",
    " lit.",
    " literally",
    # Grammatical-case and part-of-speech words usually signal usage notes
    "dative",
    "accusative",
    "genitive",
    "essive",
    "partitive",
    "translative",
    "elative",
    "inessive",
    "illative",
    "adessive",
    "ablative",
    "allative",
    "abessive",
    "comitative",
    "instructive",
    "particle",
    "predicative",
    "attributive",
    "preposition",
    "postposition",
    "prepositional",
    "postpositional",
    "prefix",
    "suffix",
    "translated",
]

261 

# Ignore translations that match one of these regular expressions
# (joined unescaped into tr_ignore_re below).
tr_ignore_regexps = [
    r"^\[[\d,]+\]$",  # Bare sense-number references like "[1,2]"
    r"\?\?$",  # Trailing "??" marks an uncertain entry
    r"^\s*$",  # Whitespace-only items
]

268 

# If a translation matches this regexp (with re.search), we print a debug
# message.  The first three alternatives are raw regexps catching trailing
# gender/number/aspect markers that should have been parsed into tags;
# the rest are literal substrings (escaped below).
tr_suspicious_re = re.compile(
    r" [mf][12345]$|"
    + r" [mfnc]$|"
    + r" (pf|impf|vir|nvir|anml|anim|inan|sg|pl)$|"
    + "|".join(
        re.escape(x)
        for x in [
            "; ",
            "* ",
            ": ",
            "[",
            "]",
            "{",
            "}",
            "/",
            "^",
            "literally",
            "lit.",
            # XXX check occurrences of ⫽, seems to be used as verb-object
            # separator but shouldn't really be part of the canonical form.
            # See e.g. 打工/Chinese
            "⫽",
            "also expressed with",
            "e.g.",
            "cf.",
            "used ",
            "script needed",
            "please add this translation",
            "usage ",
        ]
    )
)

303 

# Regular expression to be searched from translation (with re.search) to check
# if it should be ignored.  Combines: (1) the ignore prefixes, escaped and
# anchored at the start; (2) the ignore substrings, escaped, matched anywhere;
# (3) the raw ignore regexps.
tr_ignore_re = re.compile(
    "^("
    + "|".join(re.escape(x) for x in tr_ignore_prefixes)
    + ")|"
    + "|".join(re.escape(x) for x in tr_ignore_contains)
    + "|"
    + "|".join(tr_ignore_regexps)
)  # These are not to be escaped

314 

# These English texts get converted to tags in translations.
# When a translation's "english" field exactly matches a key, the field is
# removed and the space-separated tags in the value are added instead.
english_to_tags = {
    "I have": "first-person singular",
    "you have": "second-person singular",
    "she has": "third-person singular feminine",
    "he has": "third-person singular masculine",
}

322 

323 

def parse_translation_item_text(
    wxr: WiktextractContext,
    word: str,
    data: WordData,
    item: str,
    sense: Optional[str],
    lang: Optional[str],
    langcode: Optional[str],
    translations_from_template: list[str],
    is_reconstruction: bool,
) -> Optional[str]:
    """Parse one translation list item and append the resulting
    TranslationData entries to ``data["translations"]``.

    Arguments:
      wxr: extraction context, used for debug/error reporting
      word: the page title (headword) being processed
      data: word data dict receiving the "translations" entries
      item: cleaned text of the translation list item
      sense: sense description from the enclosing translation table, if any
      lang: language name inherited from a parent list item, if any
      langcode: language code captured from a translation template, if any
      translations_from_template: translation terms captured from templates
        in this item; these are protected from being split at commas
      is_reconstruction: True when processing a reconstruction page

    Returns the language name for this item (so nested sub-items can
    inherit it), or None if the item was rejected.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(data, dict)
    assert isinstance(item, str)
    assert sense is None or isinstance(sense, str)
    assert lang is None or isinstance(lang, str)  # Parent item language
    assert langcode is None or isinstance(langcode, str)  # Template langcode
    assert isinstance(translations_from_template, list)
    for x in translations_from_template:
        assert isinstance(x, str)
    assert is_reconstruction in (True, False)

    # print("parse_translation_item_text: {!r} lang={}"
    #       " langcode={}".format(item, lang, langcode))

    if not item:
        return None

    # Find and remove nested translations from the item; they are parsed
    # separately (appended to the work list below).
    nested = [m.group(1) for m in re.finditer(nested_translations_re, item)]
    if nested:
        item = re.sub(nested_translations_re, "", item)

    if re.search(r"\(\d+\)|\[\d+\]", item) and "numeral:" not in item:
        wxr.wtp.debug(
            "possible sense number in translation item: {}".format(item),
            sortid="translations/324",
        )

    # Translation items should start with a language name (except
    # some nested translation items don't and rely on the language
    # name from the higher level, and some append a language variant
    # name to a broader language name)
    extra_langcodes = set()
    if lang and name_to_code(lang, "en") != "":
        lang_code = name_to_code(lang, "en")
        extra_langcodes.add(lang_code)
        # Canonicalize language name (we could have gotten it via
        # alias or other_names)
        if new_lang_name := code_to_name(lang_code, "en"):
            lang = new_lang_name
    # The prefix may end in either an ASCII or a fullwidth colon
    m = re.match(r"\*?\s*([-' \w][-'&, \w()]*)[:：]\s*", item)
    tags = []
    if m:
        lang_sublang = ""
        sublang = m.group(1).strip()
        language_name_variations: list[str] = []
        if lang and sublang:
            # Try combinations like "Low German" + "Saxon" in both orders,
            # with spaces and with hyphens, as candidate language names.
            lang_sublang = lang + " " + sublang
            sublang_lang = sublang + " " + lang
            language_name_variations.extend(
                (
                    lang_sublang,
                    sublang_lang,
                    lang_sublang.replace(" ", "-"),
                    sublang_lang.replace(" ", "-"),
                )
            )
        if " " in sublang:
            language_name_variations.append(sublang.replace(" ", "-"))
        if "-" in sublang:
            language_name_variations.append(sublang.replace("-", " "))

        if lang is None:
            if sublang == "Note":
                return None
            lang = sublang
        elif lang_sublang and any(
            name_to_code(captured_lang := lang_comb, "en") != ""
            # The walrus operator captures the matching combination so it
            # can be used after the any() call (PEP 572 scoping).
            for lang_comb in language_name_variations
        ):
            lang = captured_lang
        elif sublang in script_and_dialect_names:
            # If the second-level name is a script name, add it as
            # tag and keep the top-level language.
            # This helps with languages that have script names
            # on the same level; those scripts may also be valid
            # language names.  See leaf/English/Translations/Pali.
            tags.append(sublang.replace(" ", "-"))
        elif sublang in tr_second_tagmap:
            # Certain second-level names are interpreted as tags
            # (mapped to tags).  Note that these may still have
            # separate language codes, so additional langcode
            # removal tricks may need to be played below.
            tags.extend(tr_second_tagmap[sublang].split())
        elif name_to_code(sublang, "en") != "":
            lang = sublang
        elif sublang[0].isupper() and classify_desc(sublang) == "tags":
            # Interpret it as a tag
            tags.append(sublang)
        else:
            # We don't recognize this prefix
            wxr.wtp.error(
                "unrecognized prefix (language name?) in "
                "translation item: {}".format(item),
                sortid="translations/369",
            )
            return None
        # Strip the language name/tag prefix from the item
        item = item[m.end() :]
    elif lang is None:
        # No matching language prefix.  Try if it is missing colon.
        parts = item.split()
        if len(parts) > 1 and name_to_code(parts[0], "en") != "":
            lang = parts[0]
            item = " ".join(parts[1:])
        else:
            if "__IGNORE__" not in item:
                wxr.wtp.error(
                    "no language name in translation item: {}".format(item),
                    sortid="translations/382",
                )
            return None

    # Map non-standard language names (e.g., "Apache" -> "Apachean")
    lang = tr_langname_map.get(lang, lang)

    # If we didn't get language code from the template, look it up
    # based on language name
    if langcode is None and name_to_code(lang, "en") != "":
        langcode = name_to_code(lang, "en")

    # Remove (<langcode>) parts from the item.  They seem to be
    # generated by {{t+|...}}.
    if langcode:
        extra_langcodes.add(langcode)
        if "-" in langcode:
            extra_langcodes.add(langcode.split("-")[0])
        if langcode in (
            "zh",
            "yue",
            "cdo",
            "cmn",
            "dng",
            "hak",
            "mnp",
            "nan",
            "wuu",
            "zh-min-nan",
        ):
            # A Chinese lect may be annotated with any of the other codes
            extra_langcodes.update(
                [
                    "zh",
                    "yue",
                    "cdo",
                    "cmn",
                    "dng",
                    "hak",
                    "mnp",
                    "nan",
                    "wuu",
                    "zh-min-nan",
                ]
            )
        elif langcode in ("nn", "nb", "no"):
            # Norwegian variants may be annotated with any Norwegian code
            extra_langcodes.update(["no", "nn", "nb"])
        for x in extra_langcodes:
            item = re.sub(r"\s*\^?\({}\)".format(re.escape(x)), "", item)

    # Map translations obtained from templates into magic characters
    # before splitting the translations list.  This way, if a comma
    # (or semicolon etc) was used inside the template, it won't get
    # split.  We restore the magic characters into the original
    # translations after splitting.  This kludge improves robustness
    # of collecting translations for phrases whose translations
    # may contain commas.  Longest first, so a shorter term cannot
    # clobber part of a longer one.
    translations_from_template = sorted(
        translations_from_template, key=len, reverse=True
    )
    tr_mappings = {}
    for i, trt in enumerate(translations_from_template):
        if not trt:
            continue
        ch = chr(MAGIC_FIRST + i)
        rex = re.escape(trt)
        # Require word boundaries only where the term starts/ends with an
        # alphanumeric, so punctuation-adjacent terms still match.
        if trt[0].isalnum():
            rex = r"\b" + rex
        if trt[-1].isalnum():
            rex = rex + r"\b"
        item = re.sub(rex, ch, item)
        tr_mappings[ch] = trt

    # There may be multiple translations, separated by comma
    nested.append(item)
    for item in nested:
        # This never gets updated, so it's always empty; kept for shape
        # compatibility with the tag-merging code below.
        tagsets: list[tuple[str, ...]] = []

        # NOTE(review): the fullwidth separators (／) below were collapsed
        # to ASCII in some copies; the duplicates are meaningless unless
        # they are the fullwidth forms.
        for part in split_at_comma_semi(
            item, extra=[" / ", " ／ ", "／", r"\| furthermore: "]
        ):
            # Substitute the magic characters back to original
            # translations (this is part of dealing with
            # phrasal translations containing commas).
            part = re.sub(
                r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST),
                lambda mch: tr_mappings.get(mch.group(0), mch.group(0)),
                part,
            )

            if part.endswith(":"):  # E.g. "salt of the earth"/Korean
                part = part[:-1].strip()
            if not part:
                continue

            # Accumulate the translation entry for this part
            tr: TranslationData = {"lang": lang}
            if langcode:
                tr["code"] = langcode  # DEPRECATED in favor of "lang_code"
                tr["lang_code"] = langcode
            if tags:
                tr["tags"] = list(tags)
                for ttup in tagsets:
                    tr["tags"].extend(ttup)
            if sense:
                if sense.startswith(
                    (
                        "Translations to be checked",
                        ":The translations below need to be checked",
                    )
                ):
                    continue  # Skip such translations
                else:
                    tr["sense"] = sense

            # Check if this part starts with (tags)
            m = re.match(r"\(([^)]+)\) ", part)
            if m:
                par = m.group(1)
                rest = part[m.end() :]
                cls = classify_desc(par, no_unknown_starts=True)
                if cls == "tags":
                    tagsets2, topics2 = decode_tags(par)
                    for ttup in tagsets2:
                        data_extend(tr, "tags", ttup)
                    data_extend(tr, "topics", topics2)
                    part = rest

            # Check if this part ends with (tags).  Note that
            # note-re will mess things up if we rely on this being
            # checked later.
            m = re.search(r" +\(([^)]+)\)$", part)
            if m:
                par = m.group(1)
                rest = part[: m.start()]
                cls = classify_desc(par, no_unknown_starts=True)
                if cls == "tags":
                    tagsets2, topics2 = decode_tags(par)
                    for ttup in tagsets2:
                        data_extend(tr, "tags", ttup)
                    data_extend(tr, "topics", topics2)
                    part = rest

            # Check if this part starts with "<tags/english>: <rest>"
            m = re.match(r"([-\w() ]+): ", part)
            if m:
                par = m.group(1).strip()
                rest = part[m.end() :]
                if par in ("", "see"):
                    # BUG FIX: previously assigned the literal string
                    # "rest"; keep the remainder of the part instead.
                    part = rest
                else:
                    cls = classify_desc(par)
                    # print("par={!r} cls={!r}".format(par, cls))
                    if cls == "tags":
                        tagsets2, topics2 = decode_tags(par)
                        for ttup in tagsets2:
                            data_extend(tr, "tags", ttup)
                        data_extend(tr, "topics", topics2)
                        part = rest
                    elif cls == "english":
                        # English prefixes are either usage notes or
                        # glosses for the translation
                        if re.search(tr_note_re, par):
                            if "note" in tr:
                                tr["note"] += "; " + par
                            else:
                                tr["note"] = par
                        else:
                            if "translation" in tr and "english" in tr:
                                # DEPRECATED for "translation"
                                tr["english"] += "; " + par
                                tr["translation"] += "; " + par
                            else:
                                # DEPRECATED for "translation"
                                tr["english"] = par
                                tr["translation"] = par
                        part = rest

            # Skip translations that our template_fn says to ignore
            # and those that contain Lua execution errors.
            if "__IGNORE__" in part:
                continue  # Contains something we want to ignore
            if part.startswith("Lua execution error"):
                continue

            # Handle certain suffixes in translations that
            # we might put in "note" but that we can actually
            # parse into tags.
            for suffix, t in (
                (" with dative", "with-dative"),
                (" with genitive", "with-genitive"),
                (" with accusative", "with-accusative"),
                (" in subjunctive", "with-subjunctive"),
                (" and conditional mood", "with-conditional"),
                (" - I have - you have", "first-person second-person singular"),
                (" - I have", "first-person singular"),
                (" - you have", "second-person singular"),
            ):
                if part.endswith(suffix):
                    part = part[: -len(suffix)]
                    data_append(tr, "tags", t)
                    break

            # Handle certain prefixes in translations
            for prefix, t in (("subjunctive of ", "with-subjunctive"),):
                if part.startswith(prefix):
                    part = part[len(prefix) :]
                    data_append(tr, "tags", t)
                    break

            # Skip certain one-character translations entirely
            # (these could result from templates being ignored)
            if part in ",;.":
                continue

            if "english" in tr and tr["english"] in english_to_tags:
                data_extend(tr, "tags", english_to_tags[tr["english"]].split())
                del tr["english"]  # DEPRECATED for "translation"
                if "translation" in tr:
                    del tr["translation"]

            # Certain values indicate it is not actually a translation.
            # See definition of tr_ignore_re to adjust.
            m = re.search(tr_ignore_re, part)
            w: Optional[str] = None

            if m and (
                m.start() != 0 or m.end() != len(part) or len(part.split()) > 1
            ):
                # This translation will be skipped because it
                # seems to be some kind of explanatory text.
                # However, let's put it in the "note" field
                # instead, unless it is one of the listed fully
                # ignored ones.
                if part in ("please add this translation if you can",):
                    continue
                # Save in note field
                tr["note"] = part
            else:
                # Interpret it as an actual translation
                parse_translation_desc(wxr, lang, part, tr)
                w = tr.get("word")
                if not w:
                    continue  # Not set or empty
                if w.startswith(("*", ":")):
                    w = w[1:].strip()
                if w in ("[Term?]", ":", "/", "?"):
                    continue  # These are not valid linkage targets
                if len(w) > 3 * len(word) + 20:
                    # Accept translation if word looks like acronym:
                    # 'ISBN', 'I.S.B.N'.isupper() return True, and
                    # false positives are unlikely.
                    if not word.isupper():
                        # Likely descriptive text or example because
                        # it is much too long.
                        wxr.wtp.debug(
                            "Translation too long compared to word, so"
                            " it is skipped",
                            sortid="translations/609-20230504",
                        )
                        del tr["word"]
                        tr["note"] = w

            # Sanity check: try to detect certain suspicious
            # patterns in translations
            if "word" in tr:
                m = re.search(tr_suspicious_re, tr["word"])
                if m and lang not in (
                    "Bats",  # ^ in tree/English/Tr/Bats
                ):
                    wxr.wtp.debug(
                        "suspicious translation with {!r}: {}".format(
                            m.group(0), tr
                        ),
                        sortid="translations/611",
                    )

            if "tags" in tr:
                tr["tags"] = sorted(set(tr["tags"]))

            # If we have only notes, add as-is
            if "word" not in tr:
                data_append(data, "translations", tr)
                continue

            # Split if it contains no spaces
            if w:
                alts = [w]
                if " " not in w:
                    # If no spaces, split by ASCII or fullwidth slash
                    alts = re.split(r"/|／", w)
                # Note: there could be remaining slashes, but they are
                # sometimes used in ways we cannot resolve programmatically.
                # Create translations for each alternative.
                for alt in alts:
                    alt = alt.strip()
                    tr1 = copy.deepcopy(tr)
                    if alt.startswith("*") or alt.startswith(":"):
                        alt = alt[1:].strip()
                    if not alt:
                        continue
                    tr1["word"] = alt
                    data_append(data, "translations", tr1)

    # Return the language name, in case we have subitems
    return lang