Coverage for src/wiktextract/extractor/en/translations.py: 63%

231 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1# Code related to parsing translations 

2# 

3# Copyright (c) 2019-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import copy 

6import re 

7from typing import Optional 

8 

9from mediawiki_langcodes import code_to_name, name_to_code 

10from wikitextprocessor import MAGIC_FIRST, MAGIC_LAST 

11 

12from ...datautils import data_append, data_extend, split_at_comma_semi 

13from ...wxr_context import WiktextractContext 

14from .form_descriptions import ( 

15 classify_desc, 

16 decode_tags, 

17 nested_translations_re, 

18 parse_translation_desc, 

19 tr_note_re, 

20) 

21from .type_utils import TranslationData, WordData 

22 

# Corrections for language names as they are written in translation lists:
# each key is the name found in the list, each value the actual language
# name.  For example, "Apache" is not a language name, but "Apachean" is.
tr_langname_map: dict[str, str] = {
    "Apache": "Apachean",
    "Lutshootseed": "Lushootseed",
    "Old Assamese": "Early Assamese",
}

30 

# Second-level names in translation lists that denote a script or a dialect
# rather than a language of their own.  Some script names are also valid
# language names, but it looks like those are not used as language names on
# the second level.  An item with one of these names stays under its parent
# language and receives the name as a tag (spaces replaced by hyphens).
script_and_dialect_names: set[str] = {
    # Scripts
    "ALUPEC",
    "Adlam",
    "Arabic",  # Script for Kashmiri
    "Bengali",
    "Burmese",
    "Carakan",
    "CJKV Characters",
    "Cyrillic",
    "Devanagari",
    "Glagolitic",
    "Gurmukhi",
    "Hebrew",  # For Aramaic
    "Jawi",
    "Khmer",
    "Latin",
    "Mongolian",
    "Roman",
    "Shahmukhi",
    "Sinhalese",
    "Syriac",  # For Aramaic
    "Classical Syriac",  # For Aramaic
    "Taraškievica",
    "Thai",
    "Uyghurjin",
    # Chinese dialects/languages
    "Cantonese",  # Variant of Chinese
    "Dungan",  # Chinese
    "Gan",  # Chinese
    "Hakka",  # Chinese
    "Hokkien",  # Chinese
    "Jin",  # Chinese
    "Mandarin",  # Chinese
    "Min Bei",  # Chinese
    "Min Dong",  # Chinese
    "Min Nan",  # Chinese
    "Wu",  # Chinese
    "Xiang",  # Chinese
    "Jianghuai Mandarin",  # Chinese
    "Jilu Mandarin",  # Chinese
    "Jin Mandarin",  # Chinese
    "Northern Mandarin",  # Chinese
    "Southwestern Mandarin",  # Chinese
    "Taiwanese Mandarin",  # Chinese
    "Coastal Min",  # Chinese
    "Inland Min",  # Chinese
    "Leizhou Min",  # Chinese
    "Min",  # Chinese
    "Puxian Min",  # Chinese
    "Shanghainese Wu",  # Chinese
    "Wenzhou Wu",  # Chinese
    "Wenzhou",  # Chinese
    "Hsinchu Hokkien",  # Chinese
    "Jinjiang Hokkien",  # Chinese
    "Kaohsiung Hokkien",  # Chinese
    "Pinghua",  # Chinese
    "Eastern Punjabi",
    "Western Punjabi",
    # Various countries/regions
    "Alsace",
    "Bavaria",
    "Belgium",
    "Canada",
    "Central",
    "Cologne",
    "Fogo",
    "Föhr",
    "Föhr-Amrum",
    "Hallig",
    "Helgoland",
    "Heligoland",
    "Santiago",
    "Sylt",
    "Mooring",
    "Vancouver Island",
    "Wiedingharde",
    "Anpezan",  # Variant of Ladin
    "Badiot",  # Ladin
    "Fascian",  # Ladin
    "Fodom",  # Ladin
    "Gherdëina",  # Ladin
    "Anbarani",  # Variant of Talysh
    "Asalemi",  # Variant of Talysh
    "Alemannic German",  # Variant of German
    "Rhine Franconian",  # Variant of German
    "German Low German",  # Variant of Low German
    "Campidanese",  # Variant of Sardinian
    "Logudorese",  # Variant of Sardinian
    "Digor",  # Variant of Ossetian
    "Iron",  # Variant of Ossetian
    "Northern Puebla",  # Variant of Nahuatl
    "Mecayapan",  # Variant of Nahuatl
    "Egyptian Arabic",  # Variant of Arabic
    "Gulf Arabic",  # Variant of Arabic
    "Hijazi Arabic",  # Variant of Arabic
    "Moroccan Arabic",  # Variant of Arabic
    "North Levantine Arabic",  # Variant of Arabic
    "South Levantine Arabic",  # Variant of Arabic
    "Alviri",  # Variant of Alviri-Vidari
    "Vidari",  # Variant of Alviri-Vidari
    "Tashelhit",  # Variant of Berber
    "Bokmål",  # Variant of Norwegian
    "Nynorsk",  # Variant of Norwegian
    "Mycenaean",  # Variant of Greek
    # Language varieties
    "Ancient",
    "Classical",
    "Draweno-Polabian",
    "Literary",
    "Lower",
    "Manitoba Saulteux",
    "Modern",
    "Modern Polabian",
    "Modified traditional",
    "Northern",
    "Northern and Southern",
    "Old Polabian",
    "Simplified",
    "Southern",
    "Traditional",
    "Western",
    "1708",
    "1918",
}

166 

# Second-level names in translation lists that are to be read as tags.
# The value is the space-separated list of tags the name stands for.
tr_second_tagmap: dict[str, str] = {
    "Föhr-Amrum, Bökingharde": "Föhr-Amrum Bökingharde",
    "Halligen, Goesharde, Karrhard": "Halligen Goesharde Karrhard",
    "Föhr-Amrum and Sylt dialect": "Föhr-Amrum Sylt",
    "Hallig and Mooring": "Hallig Mooring",
    "Föhr-Amrum & Mooring": "Föhr-Amrum Mooring",
}

176 

# Literal prefixes (case-sensitive) marking a translation that should be
# ignored.  The entries are escaped and combined, in this order, into the
# anchored alternative of tr_ignore_re.
tr_ignore_prefixes: list[str] = [
    "+",
    "Different structure used",
    "Literally",
    "No equivalent",
    "Not used",
    "Please add this translation if you can",
    "See: ",
    "Use ",
    "[Book Pahlavi needed]",
    "[book pahlavi needed]",
    "[script needed]",
    "different structure used",
    "e.g.",
    "lit.",
    "literally",
    "no equivalent",
    "normally ",
    "not used",
    "noun compound ",
    "please add this translation if you can",
    "prefix ",
    "see: ",
    "suffix ",
    "use ",
    "usually ",
]

205 

# Literal substrings (case-sensitive) that may occur anywhere in a
# translation and indicate that it is explanatory text, not a translation.
# Such text is stored in the "note" field rather than in "word".  The
# entries are escaped and joined into tr_ignore_re in this order, so longer
# phrases are listed before the shorter words they contain.
tr_ignore_contains: list[str] = [
    "usually expressed with ",
    " can be used ",
    " construction used",
    " used with ",
    " + ",
    "genitive case",
    "dative case",
    "nominative case",
    "accusative case",
    "absolute state",
    "infinitive of ",
    "participle of ",
    "for this sense",
    "depending on the circumstances",
    "expressed with ",
    " expression ",
    " means ",
    " is used",
    " — ",  # Used to give example sentences
    " translation",
    "not attested",
    "grammatical structure",
    "construction is used",
    "tense used",
    " lit.",
    " literally",
    "dative",
    "accusative",
    "genitive",
    "essive",
    "partitive",
    "translative",
    "elative",
    "inessive",
    "illative",
    "adessive",
    "ablative",
    "allative",
    "abessive",
    "comitative",
    "instructive",
    "particle",
    "predicative",
    "attributive",
    "preposition",
    "postposition",
    "prepositional",
    "postpositional",
    "prefix",
    "suffix",
    "translated",
]

261 

# Regular expressions marking a translation that should be ignored.  Unlike
# the literal lists, these are inserted into tr_ignore_re without escaping.
tr_ignore_regexps: list[str] = [
    r"^\[[\d,]+\]$",  # only a bracketed list of digits and commas
    r"\?\?$",  # ends in two question marks
    r"^\s*$",  # empty or whitespace only
]

268 

# A translation matching this pattern (with re.search) looks suspicious and
# causes a debug message to be printed.  The pattern is an alternation of a
# few trailing-marker regular expressions followed by escaped literals.
tr_suspicious_re = re.compile(
    "|".join(
        [
            r" [mf][12345]$",
            r" [mfnc]$",
            r" (pf|impf|vir|nvir|anml|anim|inan|sg|pl)$",
        ]
        + [
            re.escape(literal)
            for literal in (
                "; ",
                "* ",
                ": ",
                "[",
                "]",
                "{",
                "}",
                "/",
                "^",
                "literally",
                "lit.",
                # XXX check occurrences of ⫽, seems to be used as verb-object
                # separator but shouldn't really be part of the canonical
                # form.  See e.g. 打工/Chinese
                "⫽",
                "also expressed with",
                "e.g.",
                "cf.",
                "used ",
                "script needed",
                "please add this translation",
                "usage ",
            )
        ]
    )
)

303 

# Pattern searched (with re.search) in a translation to decide whether it
# should be ignored.  It is the alternation of three parts, in this order.
tr_ignore_re = re.compile(
    "|".join(
        (
            # Any of the ignored prefixes, anchored at the start
            "^(" + "|".join(map(re.escape, tr_ignore_prefixes)) + ")",
            # Any of the ignored substrings, anywhere in the text
            "|".join(map(re.escape, tr_ignore_contains)),
            # Ready-made regular expressions; these are not to be escaped
            "|".join(tr_ignore_regexps),
        )
    )
)

314 

# English glosses in translations that are converted to tags instead.
# The value is the space-separated list of tags for the gloss.
english_to_tags: dict[str, str] = {
    "I have": "first-person singular",
    "you have": "second-person singular",
    "she has": "third-person singular feminine",
    "he has": "third-person singular masculine",
}

322 

323 

def parse_translation_item_text(
    wxr: WiktextractContext,
    word: str,
    data: WordData,
    item: str,
    sense: Optional[str],
    lang: Optional[str],
    langcode: Optional[str],
    translations_from_template: list[str],
    is_reconstruction: bool,
) -> Optional[str]:
    """Parse the text of one translation list item and store the result.

    The item normally starts with a "<name>:" prefix.  The prefix is
    resolved to a language name, or to tags when it is a script/dialect
    name under the parent language.  The remaining text is split into
    individual translations, each of which is appended to
    ``data["translations"]``.

    Parameters:
        wxr: context; used for debug/error messages (``wxr.wtp``) and
            passed on to ``parse_translation_desc``.
        word: the word whose translations are parsed; here it is only used
            for the "translation too long" sanity check.
        data: dictionary that receives the parsed translations under the
            "translations" key.
        item: text of the translation item.
        sense: sense the translations belong to, stored as "sense" in each
            translation.  Items are skipped when it starts with
            "Translations to be checked" or
            ":The translations below need to be checked".
        lang: language name of the parent item, or None on the top level.
        langcode: language code from the template, or None to look it up
            from the language name.
        translations_from_template: translation texts produced by
            templates; they are protected from being split at commas etc.
        is_reconstruction: only type-checked here, not otherwise used.

    Returns:
        The language name for the item (so that sub-items can inherit
        it), or None when the item is empty, is a "Note" item, or has no
        recognizable language name or prefix.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)
    assert isinstance(data, dict)
    assert isinstance(item, str)
    assert sense is None or isinstance(sense, str)
    assert lang is None or isinstance(lang, str)  # Parent item language
    assert langcode is None or isinstance(langcode, str)  # Template langcode
    assert isinstance(translations_from_template, list)
    for x in translations_from_template:
        assert isinstance(x, str)
    assert is_reconstruction in (True, False)

    # print("parse_translation_item_text: {!r} lang={}"
    #       " langcode={}".format(item, lang, langcode))

    if not item:
        return None

    # Find and remove nested translations from the item
    nested = [m.group(1) for m in re.finditer(nested_translations_re, item)]
    if nested:
        item = re.sub(nested_translations_re, "", item)

    if re.search(r"\(\d+\)|\[\d+\]", item) and "numeral:" not in item:
        wxr.wtp.debug(
            "possible sense number in translation item: {}".format(item),
            sortid="translations/324",
        )

    # Translation items should start with a language name (except
    # some nested translation items don't and rely on the language
    # name from the higher level, and some append a language variant
    # name to a broader language name)
    extra_langcodes = set()
    if lang and name_to_code(lang, "en") != "":
        lang_code = name_to_code(lang, "en")
        extra_langcodes.add(lang_code)
        # Canonicalize language name (we could have gotten it via
        # alias or other_names)
        lang = code_to_name(lang_code, "en")
    # NOTE(review): ":" is listed twice in the final character class below,
    # and the "/" separators further down are duplicated the same way.  One
    # of each pair may have been a full-width character that was lost when
    # this listing was produced -- confirm against the repository copy.
    m = re.match(r"\*?\s*([-' \w][-'&, \w()]*)[::]\s*", item)
    tags: list[str] = []
    if m:
        sublang = m.group(1).strip()
        # Candidate names combining the parent language with the prefix.
        # The list stays empty when either of them is missing or empty.
        language_name_variations: list[str] = []
        if lang and sublang:
            lang_sublang = lang + " " + sublang
            sublang_lang = sublang + " " + lang
            language_name_variations.extend(
                (
                    lang_sublang,
                    sublang_lang,
                    lang_sublang.replace(" ", "-"),
                    sublang_lang.replace(" ", "-"),
                )
            )
            if " " in sublang:
                language_name_variations.append(sublang.replace(" ", "-"))
            if "-" in sublang:
                language_name_variations.append(sublang.replace("-", " "))
        # First candidate that is a known language name, if any.  Searching
        # the (possibly empty) list avoids referring to a combined name
        # that was never built.
        known_variation = next(
            (
                name
                for name in language_name_variations
                if name_to_code(name, "en") != ""
            ),
            None,
        )

        if lang is None:
            if sublang == "Note":
                return None
            lang = sublang
        elif known_variation is not None:
            lang = known_variation
        elif sublang in script_and_dialect_names:
            # If the second-level name is a script name, add it as
            # tag and keep the top-level language.
            # This helps with languages that have script names
            # on the same level; those scripts may also be valid
            # language names.  See leaf/English/Translations/Pali.
            tags.append(sublang.replace(" ", "-"))
        elif sublang in tr_second_tagmap:
            # Certain second-level names are interpreted as tags
            # (mapped to tags).  Note that these may still have
            # separate language codes, so additional langcode
            # removal tricks may need to be played below.
            tags.extend(tr_second_tagmap[sublang].split())
        elif name_to_code(sublang, "en") != "":
            lang = sublang
        elif (
            sublang
            and sublang[0].isupper()
            and classify_desc(sublang) == "tags"
        ):
            # Interpret it as a tag
            tags.append(sublang)
        else:
            # We don't recognize this prefix (this includes a prefix
            # that is empty after stripping)
            wxr.wtp.error(
                "unrecognized prefix (language name?) in "
                "translation item: {}".format(item),
                sortid="translations/369",
            )
            return None
        # Strip the language name/tag from the item
        item = item[m.end() :]
    elif lang is None:
        # No matching language prefix.  Try if it is missing colon.
        parts = item.split()
        if len(parts) > 1 and name_to_code(parts[0], "en") != "":
            lang = parts[0]
            item = " ".join(parts[1:])
        else:
            if "__IGNORE__" not in item:
                wxr.wtp.error(
                    "no language name in translation item: {}".format(item),
                    sortid="translations/382",
                )
            return None

    # Map non-standard language names (e.g., "Apache" -> "Apachean")
    lang = tr_langname_map.get(lang, lang)

    # If we didn't get language code from the template, look it up
    # based on language name
    if langcode is None and name_to_code(lang, "en") != "":
        langcode = name_to_code(lang, "en")

    # Remove (<langcode>) parts from the item.  They seem to be
    # generated by {{t+|...}}.
    if langcode:
        extra_langcodes.add(langcode)
        if "-" in langcode:
            extra_langcodes.add(langcode.split("-")[0])
        chinese_codes = (
            "zh",
            "yue",
            "cdo",
            "cmn",
            "dng",
            "hak",
            "mnp",
            "nan",
            "wuu",
            "zh-min-nan",
        )
        norwegian_codes = ("nn", "nb", "no")
        # Any code of these groups may appear for any language in the group
        if langcode in chinese_codes:
            extra_langcodes.update(chinese_codes)
        elif langcode in norwegian_codes:
            extra_langcodes.update(norwegian_codes)
        for code in extra_langcodes:
            item = re.sub(r"\s*\^?\({}\)".format(re.escape(code)), "", item)

    # Map translations obtained from templates into magic characters
    # before splitting the translations list.  This way, if a comma
    # (or semicolon etc) was used inside the template, it won't get
    # split.  We restore the magic characters into the original
    # translations after splitting.  This kludge improves robustness
    # of collection translations for phrases whose translations
    # may contain commas.  Longest translations are replaced first.
    translations_from_template = sorted(
        translations_from_template, key=len, reverse=True
    )
    tr_mappings: dict[str, str] = {}
    for i, trt in enumerate(translations_from_template):
        if not trt:
            continue
        ch = chr(MAGIC_FIRST + i)
        rex = re.escape(trt)
        # Only match at word boundaries when the translation starts/ends
        # with a word character
        if trt[0].isalnum():
            rex = r"\b" + rex
        if trt[-1].isalnum():
            rex = rex + r"\b"
        item = re.sub(rex, ch, item)
        tr_mappings[ch] = trt

    magic_re = re.compile(r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST))

    # There may be multiple translations, separated by comma
    nested.append(item)
    for subitem in nested:
        for part in split_at_comma_semi(
            subitem, extra=[" / ", " / ", "/", r"\| furthermore: "]
        ):
            # Substitute the magic characters back to original
            # translations (this is part of dealing with
            # phrasal translations containing commas).
            part = magic_re.sub(
                lambda mm: tr_mappings.get(mm.group(0), mm.group(0)),
                part,
            )

            if part.endswith(":"):  # E.g. "salt of the earth"/Korean
                part = part[:-1].strip()
            if not part:
                continue

            # Start a new translation record for this part
            tr: TranslationData = {"lang": lang}
            if langcode:
                tr["code"] = langcode
            if tags:
                tr["tags"] = list(tags)
            if sense:
                if sense.startswith(
                    (
                        "Translations to be checked",
                        ":The translations below need to be checked",
                    )
                ):
                    continue  # Skip such translations
                tr["sense"] = sense

            # Check if this part starts with (tags)
            m = re.match(r"\(([^)]+)\) ", part)
            if m:
                par = m.group(1)
                rest = part[m.end() :]
                cls = classify_desc(par, no_unknown_starts=True)
                if cls == "tags":
                    tagsets2, topics2 = decode_tags(par)
                    for ttup in tagsets2:
                        data_extend(tr, "tags", ttup)
                    data_extend(tr, "topics", topics2)
                    part = rest

            # Check if this part ends with (tags).  Note that
            # note-re will mess things up if we rely on this being
            # checked later.
            m = re.search(r" +\(([^)]+)\)$", part)
            if m:
                par = m.group(1)
                rest = part[: m.start()]
                cls = classify_desc(par, no_unknown_starts=True)
                if cls == "tags":
                    tagsets2, topics2 = decode_tags(par)
                    for ttup in tagsets2:
                        data_extend(tr, "tags", ttup)
                    data_extend(tr, "topics", topics2)
                    part = rest

            # Check if this part starts with "<tags/english>: <rest>"
            m = re.match(r"([-\w() ]+): ", part)
            if m:
                par = m.group(1).strip()
                rest = part[m.end() :]
                if par in ("", "see"):
                    # Drop the empty or "see" prefix and keep what follows
                    part = rest
                else:
                    cls = classify_desc(par)
                    # print("par={!r} cls={!r}".format(par, cls))
                    if cls == "tags":
                        tagsets2, topics2 = decode_tags(par)
                        for ttup in tagsets2:
                            data_extend(tr, "tags", ttup)
                        data_extend(tr, "topics", topics2)
                        part = rest
                    elif cls == "english":
                        if re.search(tr_note_re, par):
                            if "note" in tr:
                                tr["note"] += "; " + par
                            else:
                                tr["note"] = par
                        else:
                            if "english" in tr:
                                tr["english"] += "; " + par
                            else:
                                tr["english"] = par
                        part = rest

            # Skip translations that our template_fn says to ignore
            # and those that contain Lua execution errors.
            if "__IGNORE__" in part:
                continue  # Contains something we want to ignore
            if part.startswith("Lua execution error"):
                continue

            # Handle certain suffixes in translations that
            # we might put in "note" but that we can actually
            # parse into tags.
            for suffix, t in (
                (" with dative", "with-dative"),
                (" with genitive", "with-genitive"),
                (" with accusative", "with-accusative"),
                (" in subjunctive", "with-subjunctive"),
                (" and conditional mood", "with-conditional"),
                (" - I have - you have", "first-person second-person singular"),
                (" - I have", "first-person singular"),
                (" - you have", "second-person singular"),
            ):
                if part.endswith(suffix):
                    part = part[: -len(suffix)]
                    data_append(tr, "tags", t)
                    break

            # Handle certain prefixes in translations
            for prefix, t in (("subjunctive of ", "with-subjunctive"),):
                if part.startswith(prefix):
                    part = part[len(prefix) :]
                    data_append(tr, "tags", t)
                    break

            # Skip certain one-character translations entirely
            # (these could result from templates being ignored)
            if part in ",;.":
                continue

            if "english" in tr and tr["english"] in english_to_tags:
                data_extend(tr, "tags", english_to_tags[tr["english"]].split())
                del tr["english"]

            # Certain values indicate it is not actually a translation.
            # See definition of tr_ignore_re to adjust.
            m = re.search(tr_ignore_re, part)
            w: Optional[str] = None

            if m and (
                m.start() != 0 or m.end() != len(part) or len(part.split()) > 1
            ):
                # This translation will be skipped because it
                # seems to be some kind of explanatory text.
                # However, let's put it in the "note" field
                # instead, unless it is one of the listed fully
                # ignored ones.
                if part in ("please add this translation if you can",):
                    continue
                # Save in note field
                tr["note"] = part
            else:
                # Interpret it as an actual translation
                parse_translation_desc(wxr, lang, part, tr)
                w = tr.get("word")
                if not w:
                    continue  # Not set or empty
                if w.startswith(("*", ":")):
                    w = w[1:].strip()
                if w in ("[Term?]", ":", "/", "?"):
                    continue  # These are not valid linkage targets
                if len(w) > 3 * len(word) + 20:
                    # Accept translation if word looks like acronym:
                    # 'ISBN', 'I.S.B.N'.isupper() return True, and
                    # false positives are unlikely.
                    if not word.isupper():
                        # Likely descriptive text or example because
                        # it is much too long.
                        wxr.wtp.debug(
                            "Translation too long compared to word, so"
                            " it is skipped",
                            sortid="translations/609-20230504",
                        )
                        del tr["word"]
                        tr["note"] = w

            # Sanity check: try to detect certain suspicious
            # patterns in translations
            if "word" in tr:
                m = re.search(tr_suspicious_re, tr["word"])
                if m and lang not in (
                    "Bats",  # ^ in tree/English/Tr/Bats
                ):
                    wxr.wtp.debug(
                        "suspicious translation with {!r}: {}".format(
                            m.group(0), tr
                        ),
                        sortid="translations/611",
                    )

            if "tags" in tr:
                tr["tags"] = sorted(set(tr["tags"]))

            # If we have only notes, add as-is
            if "word" not in tr:
                data_append(data, "translations", tr)
                continue

            # Split if it contains no spaces
            if w:
                alts = [w]
                if " " not in w:
                    # If no spaces, split by separator
                    alts = re.split(r"/|/", w)
                # Note: there could be remaining slashes, but they are
                # sometimes used in ways we cannot resolve programmatically.
                # Create translations for each alternative.
                for alt in alts:
                    alt = alt.strip()
                    tr1 = copy.deepcopy(tr)
                    if alt.startswith("*") or alt.startswith(":"):
                        alt = alt[1:].strip()
                    if not alt:
                        continue
                    tr1["word"] = alt
                    data_append(data, "translations", tr1)

    # Return the language name, in case we have subitems
    return lang