# Code for parsing linguistic form descriptions and tags for word senses
# (both the initial part of the word entry head and its parenthesized
# parts, and the tags at the beginning of word senses)
#
# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org

import functools
import re
import unicodedata
from typing import (
    Any,
    Literal,
    Optional,
    Sequence,
    Union,
)

import Levenshtein
from nltk import TweetTokenizer  # type:ignore[import-untyped]
from wikitextprocessor.parser import WikiNode

from ...datautils import data_append, data_extend, split_at_comma_semi
from ...page import extract_links_from_node
from ...tags import (
    alt_of_tags,
    form_of_tags,
    head_final_bantu_langs,
    head_final_bantu_map,
    head_final_numeric_langs,
    head_final_other_langs,
    head_final_other_map,
    head_final_semitic_langs,
    head_final_semitic_map,
    uppercase_tags,
    valid_tags,
    xlat_descs_map,
    xlat_head_map,
    xlat_tags_map,
)
from ...topics import topic_generalize_map, valid_topics
from ...wxr_context import WiktextractContext
from .english_words import (
    english_words,
    not_english_words,
    potentially_english_words,
)
from .form_descriptions_known_firsts import known_firsts
from .taxondata import known_species
from .type_utils import (
    AltOf,
    FormData,
    LinkageData,
    SenseData,
    SoundData,
    TranslationData,
    WordData,
)

# Tokenizer for classify_desc()
tokenizer = TweetTokenizer()

# These are ignored as the value of a related form in the form head.
IGNORED_RELATED: set[str] = set(
    [
        "-",
        "־",
        "᠆",
        "‐",
        "‑",
        "‒",
        "–",
        "—",
        "―",
        "−",
        "⸺",
        "⸻",
        "﹘",
        "﹣",
        "－",
        "?",
        "(none)",
    ]
)


# First words of unicodedata.name() that indicate scripts that cannot be
# accepted in romanizations or English (i.e., should be considered "other"
# in classify_desc()).
non_latin_scripts: list[str] = [
    "ADLAM",
    "ARABIC",
    "ARABIC-INDIC",
    "ARMENIAN",
    "BALINESE",
    "BENGALI",
    "BRAHMI",
    "BRAILLE",
    "CANADIAN",
    "CHAKMA",
    "CHAM",
    "CHEROKEE",
    "CJK",
    "COPTIC",
    "COUNTING ROD",
    "CUNEIFORM",
    "CYRILLIC",
    "DOUBLE-STRUCK",
    "EGYPTIAN",
    "ETHIOPIC",
    "EXTENDED ARABIC-INDIC",
    "GEORGIAN",
    "GLAGOLITIC",
    "GOTHIC",
    "GREEK",
    "GUJARATI",
    "GURMUKHI",
    "HANGUL",
    "HANIFI ROHINGYA",
    "HEBREW",
    "HIRAGANA",
    "JAVANESE",
    "KANNADA",
    "KATAKANA",
    "KAYAH LI",
    "KHMER",
    "KHUDAWADI",
    "LAO",
    "LEPCHA",
    "LIMBU",
    "MALAYALAM",
    "MEETEI",
    "MYANMAR",
    "NEW TAI LUE",
    "NKO",
    "OL CHIKI",
    "OLD PERSIAN",
    "OLD SOUTH ARABIAN",
    "ORIYA",
    "OSMANYA",
    "PHOENICIAN",
    "SAURASHTRA",
    "SHARADA",
    "SINHALA",
    "SUNDANESE",
    "SYLOTI",
    "TAI THAM",
    "TAKRI",
    "TAMIL",
    "TELUGU",
    "THAANA",
    "THAI",
    "TIBETAN",
    "TIFINAGH",
    "TIRHUTA",
    "UGARITIC",
    "WARANG CITI",
    "YI",
]
non_latin_scripts_re = re.compile(
    r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b"
)
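
# Illustrative sketch of how this list is meant to be used (per the comment
# above, the consumer is classify_desc()): unicodedata.name("α") is
# "GREEK SMALL LETTER ALPHA", so
#     non_latin_scripts_re.search(unicodedata.name("α"))
# matches with group(1) == "GREEK", marking the character as non-Latin.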

# Sanity check xlat_head_map values
for k, v in xlat_head_map.items():
    if v.startswith("?"):
        v = v[1:]
    for tag in v.split():
        if tag not in valid_tags:
            print(
                "WARNING: xlat_head_map[{}] contains"
                " unrecognized tag {}".format(k, tag)
            )

# Regexp for finding nested translations from translation items (these are
# used in, e.g., year/English/Translations/Arabic). This is actually used
# in page.py.
nested_translations_re = re.compile(
    r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format(
        "|".join(
            re.escape(x.removeprefix("?"))
            for x in sorted(xlat_head_map.values(), key=len, reverse=True)
            if x and not x.startswith("class-")
        )
    )
)
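
# Illustrative sketch of the intended shape (LABEL is hypothetical and
# stands for one of the xlat_head_map values joined into the pattern above;
# which labels exist depends on the tag tables):
#     m = nested_translations_re.search("word (LABEL: nested text)")
# would match the parenthesized part, with m.group(1) holding the full
# "LABEL: nested text" payload and m.group(2) holding LABEL.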

# Regexp that matches head tag specifiers. Used to match tags at the end of
# translations and linkages.
head_final_re_text = r"( -)?( ({}))+".format(
    "|".join(
        re.escape(x)
        for x in
        # The sort is to put longer ones first, preferring them in
        # the regexp match
        sorted(xlat_head_map.keys(), key=len, reverse=True)
    )
)
head_final_re = re.compile(head_final_re_text + r"$")

# Regexp used to match head tag specifiers at the end of a form for certain
# Bantu languages (particularly Swahili and similar languages).
head_final_bantu_re_text = r" ({})".format(
    "|".join(re.escape(x) for x in head_final_bantu_map.keys())
)
head_final_bantu_re = re.compile(head_final_bantu_re_text + "$")

# Regexp used to match head tag specifiers at the end of a form for certain
# Semitic languages (particularly Arabic and similar languages).
head_final_semitic_re_text = r" ({})".format(
    "|".join(re.escape(x) for x in head_final_semitic_map.keys())
)
head_final_semitic_re = re.compile(head_final_semitic_re_text + "$")

# Regexp used to match head tag specifiers at the end of a form for certain
# other languages (e.g., Lithuanian, Finnish, French).
head_final_other_re_text = r" ({})".format(
    "|".join(re.escape(x) for x in head_final_other_map.keys())
)
head_final_other_re = re.compile(head_final_other_re_text + "$")

# Regexp for splitting heads. See parse_word_head().
head_split_re_text_part_1 = (
    "("
    + head_final_re_text
    + "|"
    + head_final_bantu_re_text
    + "|"
    + head_final_semitic_re_text
    + "|"
    + head_final_other_re_text
)

head_split_re_text = head_split_re_text_part_1 + ")?( or |[,;]+| *$)"

head_split_re_text_no_semicolon = head_split_re_text_part_1 + ")?( or |,+| *$)"

head_split_re = re.compile(head_split_re_text)
head_split_no_semicolon_re = re.compile(head_split_re_text_no_semicolon)

# Count the unescaped "(" groups in the head-splitting pattern, presumably
# so that code embedding or indexing into this regexp can compute match
# group offsets.
head_split_re_parens = 0
for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text):
    head_split_re_parens += m.group(0).count("(")
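
# A generic sketch of the group-offset arithmetic this count enables (the
# names here are illustrative, not part of this module):
#     inner = re.escape  # placeholder comment; actual sketch below
#     # inner_pat = r"(a)(b)"                          # two "(" groups
#     # outer = re.compile(r"(" + inner_pat + r")x(c)")
#     # outer.match("abxc").group(1 + 2 + 1)           # -> "c"
# i.e., one wrapper group plus the embedded pattern's two groups shift the
# index of any group that follows the embedded pattern.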

# Parenthesized parts that are ignored in translations
tr_ignored_parens: set[str] = set(
    [
        "please verify",
        "(please verify)",
        "transliteration needed",
        "(transliteration needed)",
        "in words with back vowel harmony",
        "(in words with back vowel harmony)",
        "in words with front vowel harmony",
        "(in words with front vowel harmony)",
        "see below",
        "see usage notes below",
    ]
)
tr_ignored_parens_re = re.compile(
    r"^("
    + "|".join(re.escape(x) for x in tr_ignored_parens)
    + ")$"
    + r"|^(Can we clean up|Can we verify|for other meanings see "
    r"lit\. )"
)

# Translations that are ignored
ignored_translations: set[str] = set(
    [
        "[script needed]",
        "please add this translation if you can",
    ]
)

# Put English text into the "note" field in a translation if it contains
# one of these words
tr_note_re = re.compile(
    r"(\b(article|definite|indefinite|superlative|comparative|pattern|"
    r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|"
    r"postposition|postp|action|actions|articles|"
    r"adverb|adverbs|noun|nouns|verb|verbs|before|"
    r"after|placed|prefix|suffix|used with|translated|"
    r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|"
    r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|"
    r"conjugation|declension|class|category|plural|singular|positive|"
    r"seldom used|formal|informal|familiar|unspoken|spoken|written|"
    r"indicative|progressive|conditional|potential|"
    r"accusative|adessive|inessive|superessive|elative|allative|"
    r"dialect|dialects|object|subject|predicate|movies|recommended|language|"
    r"locative|continuous|simple|continuousness|gerund|subjunctive|"
    r"periphrastically|no equivalent|not used|not always used|"
    r"used only with|not applicable|use the|signifying|wordplay|pronounced|"
    r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|"
    r"may be replaced|stricter sense|for nonhumans|"
    r"sense:|used:|in full:|informally used|followed by|"
    r"not restricted to|pertaining to|or optionally with|are optional|"
    r"in conjunction with|in compounds|depending on the relationship|"
    r"person addressed|one person|multiple persons|may be replaced with|"
    r"optionally completed with|in the phrase|in response to|"
    r"before a|before an|preceded by|verbs ending|very common|after a verb|"
    r"with verb|with uncountable|with the objects|with stative|"
    r"can be replaced by|often after|used before|used after|"
    r"used in|clipping of|spoken|somewhat|capitalized|"
    r"short form|shortening of|shortened form|initialism of|"
    r"said to|rare:|rarer also|is rarer|negatively connoted|"
    r"previously mentioned|uncountable noun|countable noun|"
    r"countable nouns|uncountable nouns|"
    r"with predicative|with -|with imperfect|with a negated|"
    r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|"
    r'"|'
    r"general term|after a vowel|before a vowel|"
    r"form|regular|irregular|alternative)"
    r")($|[) ])|^("
    # The following are only matched at the beginning of the string
    r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:"
    r"|e\.g\.,|cf\.|compare|such as|"
    r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|"
    r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|"
    r"mainly|from|for|also|also:|acronym|"
    r"\+|with) "
)
# \b does not work at the end???

# Related forms matching this regexp will be considered suspicious if the
# page title does not also match one of these.
suspicious_related_re = re.compile(
    r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)"
    r"|[][:=<>&#*|]"
    r"| \d+$"
)

# Word forms (head forms, translations, etc.) that will be considered ok
# and silently accepted even if they would otherwise trigger a suspicious
# form warning.
ok_suspicious_forms: set[str] = set(
    [
        "but en or",  # "golden goal"/English/Tr/French
        "cœur en or",  # "heart of gold"/Eng/Tr/French
        "en or",  # golden/Eng/Tr/French
        "men du",  # jet/Etym2/Noun/Tr/Cornish
        "parachute en or",  # "golden parachute"/Eng/Tr/French
        "vieil or",  # "old gold"/Eng/Tr/French
        # "all that glitters is not gold"/Eng/Tr/French
        "tout ce qui brille n’est pas or",
        "μη αποκλειστικό or",  # inclusive or/Eng/Tr/Greek
        "period or full stop",
    ]
)


# Replacements to be done in classify_desc() before tokenizing. This is a
# workaround for shortcomings in TweetTokenizer.
tokenizer_fixup_map = {
    r"a.m.": "AM",
    r"p.m.": "PM",
}
tokenizer_fixup_re = re.compile(
    r"\b("
    + "|".join(
        re.escape(x)
        for x in sorted(
            tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True
        )
    )
    + r")"
)
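
# A minimal sketch of how the fixup is applied before tokenizing (the real
# call site is in classify_desc(); this only shows the mechanics):
#     tokenizer_fixup_re.sub(
#         lambda m: tokenizer_fixup_map[m.group(1)], "at 10 a.m. daily"
#     )
#     # -> "at 10 AM daily"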

# Unknown tags starting with these words will be silently ignored.
ignored_unknown_starts: set[str] = set(
    [
        "originally",
        "e.g.",
        "c.f.",
        "supplanted by",
        "supplied by",
    ]
)

ignored_unknown_starts_re = re.compile(
    r"^("
    + "|".join(
        re.escape(x)
        for x in sorted(ignored_unknown_starts, key=lambda x: -len(x))
    )
    + ") "
)

# If an unknown sequence starts with one of these, it will continue as an
# unknown sequence until the end, unless it turns out to have a replacement.
allowed_unknown_starts: set[str] = set(
    [
        "Relating",
        "accompanied",
        "added",
        "after",
        "answering",
        "as",
        "based",
        "before",
        "conjugated",
        "conjunction",
        "construed",
        "especially",
        "expression:",
        "figurative:",
        "followed",
        "for",
        "forms",
        "from",
        "governs",
        "in",
        "indicating",
        "modifying",
        "normally",
        "not",
        "of",
        "preceding",
        "prefixed",
        "referring",
        "relating",
        "revived",
        "said",
        "since",
        "takes",
        "used",
        "with",
        "With",
        "without",
    ]
)
# Allow the ignored unknown starts without complaining
allowed_unknown_starts.update(ignored_unknown_starts)

# Full unknown tags that will be ignored in decode_tags()
# XXX this is unused; ask Tatu where the contents are now
ignored_unknown_tags: set[str] = set([])

# Head endings that are mapped to tags
head_end_map = {
    " 1st conj.": "conjugation-1",
    " 2nd conj.": "conjugation-2",
    " 3rd conj.": "conjugation-3",
    " 4th conj.": "conjugation-4",
    " 5th conj.": "conjugation-5",
    " 6th conj.": "conjugation-6",
    " 7th conj.": "conjugation-7",
}
head_end_re = re.compile(
    r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$"
)
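
# For example, head_end_re.search("amāre 1st conj.") matches " 1st conj."
# at the end of the head, and head_end_map[" 1st conj."] then yields the
# tag "conjugation-1".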

# Dictionary of language-specific parenthesized head part starts that
# either introduce new tags or modify previous tags. The value for each
# language is a dictionary that maps the first word of the head part to
# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous
# tags or a space-separated string of tags to remove, and ``add_tags`` should
# be a string of tags to add.
lang_specific_head_map: dict[
    str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]]
] = {
    "Danish": {
        # prefix: (rem_tags space-separated string/True, add_tags s-sep str)
        "c": ("neuter", "common-gender"),
        "n": ("common-gender", "neuter"),
        "pl": ("singular neuter common-gender", "plural"),
        "sg": ("plural neuter common-gender", "singular"),
    },
}


# Regular expression used to strip additional stuff from the end of alt_of
# and form_of.
alt_of_form_of_clean_re = re.compile(
    r"(?s)("
    + "|".join(
        [
            r":",
            r'[“"]',
            r";",
            r" \(",
            r" - ",
            r" ־ ",
            r" ᠆ ",
            r" ‐ ",
            r" ‑ ",
            r" ‒ ",
            r" – ",
            r" — ",
            r" ― ",
            r" − ",
            r" ⸺ ",
            r" ⸻ ",
            r" ﹘ ",
            r" ﹣ ",
            r" － ",
            r" \+ ",
            r" \(with ",
            r" with -ra/-re",
            r"\. Used ",
            r"\. Also ",
            r"\. Since ",
            r"\. A ",
            r"\.\. A ",
            r"\. An ",
            r"\.\. An ",
            r"\. an ",
            r"\. The ",
            r"\. Spanish ",
            r"\. Language ",
            r"\. former name of ",
            r"\. AIM",
            r"\. OT",
            r"\. Not ",
            r"\. Now ",
            r"\. Nowadays ",
            r"\. Early ",
            r"\. ASEAN",
            r"\. UN",
            r"\. IMF",
            r"\. WHO",
            r"\. WIPO",
            r"\. AC",
            r"\. DC",
            r"\. DNA",
            r"\. RNA",
            r"\. SOB",
            r"\. IMO",
            r"\. Behavior",
            r"\. Income ",
            r"\. More ",
            r"\. Most ",
            r"\. Only ",
            r"\. Also ",
            r"\. From ",
            r"\. Of ",
            r"\.\. Of ",
            r"\. To ",
            r"\. For ",
            r"\. If ",
            r"\. Praenominal ",
            r"\. This ",
            r"\. Replaced ",
            r"\. CHCS is the ",
            r"\. Equivalent ",
            r"\. Initialism ",
            r"\. Note ",
            r"\. Alternative ",
            r"\. Compare ",
            r"\. Cf\. ",
            r"\. Comparable ",
            r"\. Involves ",
            r"\. Sometimes ",
            r"\. Commonly ",
            r"\. Often ",
            r"\. Typically ",
            r"\. Possibly ",
            r"\. Although ",
            r"\. Rare ",
            r"\. Instead ",
            r"\. Integrated ",
            r"\. Distinguished ",
            r"\. Given ",
            r"\. Found ",
            r"\. Was ",
            r"\. In ",
            r"\. It ",
            r"\.\. It ",
            r"\. One ",
            r"\. Any ",
            r"\. They ",
            r"\. Members ",
            r"\. Each ",
            r"\. Original ",
            r"\. Especially ",
            r"\. Usually ",
            r"\. Known ",
            r"\.\. Known ",
            r"\. See ",
            r"\. see ",
            r"\. target was not ",
            r"\. Popular ",
            r"\. Pedantic ",
            r"\. Positive ",
            r"\. Society ",
            r"\. Plan ",
            r"\. Environmentally ",
            r"\. Affording ",
            r"\. Encompasses ",
            r"\. Expresses ",
            r"\. Indicates ",
            r"\. Text ",
            r"\. Large ",
            r"\. Sub-sorting ",
            r"\. Sax",
            r"\. First-person ",
            r"\. Second-person ",
            r"\. Third-person ",
            r"\. 1st ",
            r"\. 2nd ",
            r"\. 3rd ",
            r"\. Term ",
            r"\. Northeastern ",
            r"\. Northwestern ",
            r"\. Southeast ",
            r"\. Egyptian ",
            r"\. English ",
            r"\. Cape Province was split into ",
            r"\. Pañcat",
            r"\. of the ",
            r"\. is ",
            r"\. after ",
            r"\. or ",
            r"\. chromed",
            r"\. percussion",
            r"\. with his ",
            r"\. a\.k\.a\. ",
            r"\. comparative form ",
            r"\. singular ",
            r"\. plural ",
            r"\. present ",
            r"\. his ",
            r"\. her ",
            r"\. equivalent ",
            r"\. measuring ",
            r"\. used in ",
            r"\. cutely ",
            r"\. Protects",
            r'\. "',
            r"\.^",
            r"\. \+ ",
            r"\., ",
            r". — ",
            r", a ",
            r", an ",
            r", the ",
            r", obsolete ",
            r", possessed",  # 'd/English
            r", imitating",  # 1/English
            r", derived from",
            r", called ",
            r", especially ",
            r", slang for ",
            r", used to",  # c/o /English
            r", commonly",  # b/w /English
            r" corresponding to ",
            r" equivalent to ",
            r" popularized by ",
            r" denoting ",
            r" in its various senses\.",
            r" used by ",
            r" but not for ",
            r" since ",
            r" i\.e\. ",
            r" i\. e\. ",
            r" e\.g\. ",
            r" eg\. ",
            r" etc\. ",
            r"\[http",
            r" — used as ",
            r" by K\. Forsyth ",
            r" by J\. R\. Allen ",
            r" by S\. Ferguson ",
            r" by G\. Donaldson ",
            r" May refer to ",
            r" An area or region ",
        ]
    )
    + r").*$"
)


class ValidNode:
    """Node in the valid_sequences tree. Each node is part of a chain
    or chains that form sequences built out of keys in key->tags
    maps like xlat_tags, etc. The ValidNode's 'word' is the key
    by which it is referred to in the root dict or a `children` dict,
    `end` marks that the node is the end-terminus of a sequence (but
    it can still continue if the sequence is shared by the start of
    other sequences: "nominative$" and "nominative plural$" for example),
    `tags` and `topics` are the lists containing tag and topic strings
    for terminal nodes (end==True)."""

    __slots__ = (
        "end",
        "tags",
        "topics",
        "children",
    )

    def __init__(
        self,
        end=False,
        tags: Optional[list[str]] = None,
        topics: Optional[list[str]] = None,
        children: Optional[dict[str, "ValidNode"]] = None,
    ) -> None:
        self.end = end
        self.tags: list[str] = tags or []
        self.topics: list[str] = topics or []
        self.children: dict[str, "ValidNode"] = children or {}


def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None:
    """Helper function for building trees of valid tags/sequences during
    initialization."""
    assert isinstance(tree, ValidNode)
    assert isinstance(desc, str)
    assert v is None or isinstance(v, str)
    node = tree

    # Build the tree structure: each node has children nodes
    # whose names are denoted by their dict key.
    for w in desc.split(" "):
        if w in node.children:
            node = node.children[w]
        else:
            new_node = ValidNode()
            node.children[w] = new_node
            node = new_node
    if not node.end:
        node.end = True
    if not v:
        return None  # Terminate early because there are no tags

    tagslist = []
    topicslist = []
    for vv in v.split():
        if vv in valid_tags:
            tagslist.append(vv)
        elif vv in valid_topics:
            topicslist.append(vv)
        else:
            print(
                "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv)
            )
    topics = " ".join(topicslist)
    tags = " ".join(tagslist)
    # Changed to "_tags" and "_topics" to avoid possible key-collisions.
    if topics:
        node.topics.extend([topics])
    if tags:
        node.tags.extend([tags])
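
# Illustrative sketch of the resulting trie (using the "nominative$" /
# "nominative plural$" example from the ValidNode docstring, and assuming
# both "nominative" and "plural" are in valid_tags):
#     add_to_valid_tree(tree, "nominative", "nominative")
#     add_to_valid_tree(tree, "nominative plural", "nominative plural")
# leaves tree.children["nominative"].end == True (a terminus), while that
# same node also has a "plural" child whose .end is likewise True.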

def add_to_valid_tree1(
    tree: ValidNode,
    k: str,
    v: Union[list[str], tuple[str, ...], str],
    valid_values: Union[set[str], dict[str, Any]],
) -> list[str]:
    assert isinstance(tree, ValidNode)
    assert isinstance(k, str)
    assert v is None or isinstance(v, (list, tuple, str))
    assert isinstance(valid_values, (set, dict))
    if not v:
        add_to_valid_tree(valid_sequences, k, None)
        return []
    elif isinstance(v, str):
        v = [v]
    q = []
    for vv in v:
        assert isinstance(vv, str)
        add_to_valid_tree(valid_sequences, k, vv)
        vvs = vv.split()
        for x in vvs:
            q.append(x)
    # return each individual tag
    return q


def add_to_valid_tree_mapping(
    tree: ValidNode,
    mapping: Union[dict[str, Union[list[str], str]], dict[str, str]],
    valid_values: Union[set[str], dict[str, Any]],
    recurse: bool,
) -> None:
    assert isinstance(tree, ValidNode)
    assert isinstance(mapping, dict)
    assert isinstance(valid_values, (set, dict))
    assert recurse in (True, False)
    for k, v in mapping.items():
        assert isinstance(k, str)
        assert isinstance(v, (list, str))
        if isinstance(v, str):
            q = add_to_valid_tree1(tree, k, [v], valid_values)
        else:
            q = add_to_valid_tree1(tree, k, v, valid_values)
        if recurse:
            visited = set()
            while q:
                v = q.pop()
                if v in visited:
                    continue
                visited.add(v)
                if v not in mapping:
                    continue
                vv = mapping[v]
                qq = add_to_valid_tree1(tree, k, vv, valid_values)
                q.extend(qq)


# Tree of sequences considered to be tags (includes sequences that are
# mapped to something that becomes one or more valid tags)
valid_sequences = ValidNode()
sequences_with_slashes: set[str] = set()
for tag in valid_tags:
    # The basic tags used in our tag system; some are a bit weird, but it
    # is easier to implement this with "false" positives than to filter out
    # stuff no one else uses.
    if "/" in tag:
        sequences_with_slashes.add(tag)
    add_to_valid_tree(valid_sequences, tag, tag)
for tag in uppercase_tags:
    hyphenated = re.sub(r"\s+", "-", tag)
    if "/" in tag:
        sequences_with_slashes.add(tag)
    add_to_valid_tree(valid_sequences, tag, hyphenated)

# xlat_tags_map!
add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False)
for k in xlat_tags_map:
    if "/" in k:
        sequences_with_slashes.add(k)
# Add topics to the same table, with all generalized topics also added
for topic in valid_topics:
    assert " " not in topic
    if "/" in topic:
        sequences_with_slashes.add(topic)
    add_to_valid_tree(valid_sequences, topic, topic)
# Let each original topic value stand alone. These are not generally in
# valid_topics. We add the original topics with spaces replaced by hyphens.
for topic in topic_generalize_map.keys():
    hyphenated = re.sub(r"\s+", "-", topic)
    if "/" in topic:
        sequences_with_slashes.add(topic)
    add_to_valid_tree(valid_sequences, topic, hyphenated)
# Add canonicalized/generalized topic values
add_to_valid_tree_mapping(
    valid_sequences, topic_generalize_map, valid_topics, True
)

# Regex used to divide a decode candidate into parts that shouldn't
# have their slashes turned into spaces
slashes_re = re.compile(
    r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")"
)
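
# Sketch of how decode_tags() uses this below: re.split() with a capturing
# group keeps the matched keys at odd indices, so only the even-indexed
# fragments get their slashes replaced. Assuming "masculine/feminine" were
# one of the keys containing a slash:
#     re.split(slashes_re, "archaic masculine/feminine plural")
#     # -> ["archaic ", "masculine/feminine", " plural"]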

# Regexp used to find "words" from word heads and linguistic descriptions
word_pattern = (
    r"[^ ,;()\u200e]+|"
    r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|"
    r"[\u2800-\u28ff]|"  # Braille characters
    r"\(([^()]|\([^()]*\))*\)"
)

word_re_global = re.compile(word_pattern)


def distw(titleparts: Sequence[str], word: str) -> float:
    """Computes how distinct ``word`` is from the most similar word in
    ``titleparts``. Returns 1 if the words are completely distinct, 0 if
    identical, or otherwise something in between."""
    assert isinstance(titleparts, (list, tuple))
    assert isinstance(word, str)
    w = min(
        Levenshtein.distance(word, tw) / max(len(tw), len(word))
        for tw in titleparts
    )
    return w
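
# For example, distw(["cat"], "cat") == 0.0 (identical), while
# distw(["cat"], "dog") == 1.0 (all three characters differ).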

def map_with(
    ht: dict[str, str | list[str]] | dict[str, str],
    lst: Sequence[str],
) -> list[str]:
    """Takes alternatives from ``lst``, maps them using ``ht`` to zero or
    more alternatives each, and returns a combined list of alternatives."""
    assert isinstance(ht, dict)
    assert isinstance(lst, (list, tuple))
    ret = []
    for x in lst:
        assert isinstance(x, str)
        x = x.strip()
        x = ht.get(x, x)
        if isinstance(x, str):
            if x:
                ret.append(x)
        elif isinstance(x, (list, tuple)):
            ret.extend(x)
        else:
            raise RuntimeError("map_with unexpected value: {!r}".format(x))
    return ret
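
# For example, with a hypothetical mapping:
#     map_with({"m": ["masculine"]}, ["m", " f "])  # -> ["masculine", "f"]
# ("m" is expanded via the dict; " f " is merely stripped and kept).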

TagList = list[str]
PosPathStep = tuple[int, TagList, TagList]


def check_unknown(
    from_i: int,
    to_i: int,
    i: int,
    wordlst: Sequence[str],
    allow_any: bool,
    no_unknown_starts: bool,
) -> list[PosPathStep]:
    """Check if the current section from_i->to_i is actually unknown
    or if it needs some special handling. We already presupposed that
    this is UNKNOWN; this is just called to see what *kind* of UNKNOWN."""
    assert isinstance(to_i, int)
    assert isinstance(from_i, int)
    assert isinstance(i, int)
    # Adds an unknown tag if needed. Returns the new last_i.
    # print("check_unknown to_i={} from_i={} i={}"
    #       .format(to_i, from_i, i))
    if from_i >= to_i:
        return []
    words = wordlst[from_i:to_i]
    tag = " ".join(words)
    assert tag
    # print(f"{tag=}")
    if re.match(ignored_unknown_starts_re, tag):
        # Tags with this start are to be ignored
        return [(from_i, ["UNKNOWN"], [])]
    if tag in ignored_unknown_tags:
        return []  # One of the tags listed as to be ignored
    if tag in ("and", "or"):
        return []
    if (
        not allow_any
        and not words[0].startswith("~")
        and (
            no_unknown_starts
            or words[0] not in allowed_unknown_starts
            or len(words) <= 1
        )
    ):
        # print("ERR allow_any={} words={}"
        #       .format(allow_any, words))
        return [
            (from_i, ["UNKNOWN"], ["error-unknown-tag"])
        ]  # Add ``tag`` here to include it
    else:
        return [(from_i, ["UNKNOWN"], [tag])]
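
# For example, check_unknown(0, 2, 2, ["foo", "bar"], True, False) returns
# [(0, ["UNKNOWN"], ["foo bar"])]: with allow_any=True the unrecognized
# words survive as free text, whereas with allow_any=False the same call
# would yield [(0, ["UNKNOWN"], ["error-unknown-tag"])].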

def add_new1(
    node: ValidNode,
    i: int,
    start_i: int,
    last_i: int,
    new_paths: list[list[PosPathStep]],
    new_nodes: list[tuple[ValidNode, int, int]],
    pos_paths: list[list[list[PosPathStep]]],
    wordlst: list[str],
    allow_any: bool,
    no_unknown_starts: bool,
    max_last_i: int,
) -> int:
    assert isinstance(new_paths, list)
    # print("add_new: start_i={} last_i={}".format(start_i, last_i))
    # print("$ {} last_i={} start_i={}"
    #       .format(w, last_i, start_i))
    max_last_i = max(max_last_i, last_i)  # in case last_i has grown
    if (node, start_i, last_i) not in new_nodes:
        new_nodes.append((node, start_i, last_i))
    if node.end:
        # We can see a terminal point in the search tree.
        u = check_unknown(
            last_i, start_i, i, wordlst, allow_any, no_unknown_starts
        )
        # Create new path candidates based on different past possible
        # paths; pos_paths[last_i] contains the possible paths, so add this
        # new one at the beginning(?)
        # The generator expression inside the parens yields an iterable
        # of lists, so this is .extend([(last_i...)] + u + x, ...)
        # XXX: this is becoming impossible to annotate; nodes might
        # need to become classed objects and not just dicts, or at least
        # a TypedDict with a "children" node
        new_paths.extend(
            [(last_i, node.tags, node.topics)] + u + x
            for x in pos_paths[last_i]
        )
        max_last_i = i + 1
    return max_last_i


@functools.lru_cache(maxsize=65536)
def decode_tags(
    src: str,
    allow_any=False,
    no_unknown_starts=False,
) -> tuple[list[tuple[str, ...]], list[str]]:
    tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts)
    # print(f"decode_tags: {src=}, {tagsets=}")

    # Retry with a modified source text if the first decode produced errors.
    if (
        any(s.startswith("error-") for tagset in tagsets for s in tagset)
        # I hate Python's *nested* list comprehension syntax ^
        or any(s.startswith("error-") for s in topics)
    ):
        new_tagsets: list[tuple[str, ...]] = []
        new_topics: list[str] = []

        if "/" in src:
            # slashes_re captures valid key entries that contain slashes;
            # splitting with it puts those keys ("masculine/feminine" style)
            # at the odd indices, so we skip them and replace slashes only
            # in the other fragments.
            split_parts = re.split(slashes_re, src)
            new_parts: list[str] = []
            if len(split_parts) > 1:
                for i, s in enumerate(split_parts):
                    if i % 2 == 0:
                        new_parts.append(s.replace("/", " "))
                    else:
                        new_parts.append(s)
                new_src = "".join(new_parts)
            else:
                new_src = src
            new_tagsets, new_topics = decode_tags1(
                new_src, allow_any, no_unknown_starts
            )
        elif " or " in src or " and " in src:
            # Annoying kludge.
            new_src = src.replace(" and ", " ")
            new_src = new_src.replace(" or ", " ")
            new_tagsets, new_topics = decode_tags1(
                new_src, allow_any, no_unknown_starts
            )
        # print(f"{new_tagsets=}")

        if new_tagsets or new_topics:
            old_errors = sum(
                1 for tagset in tagsets for s in tagset if s.startswith("error")
            )
            old_errors += sum(1 for s in topics if s.startswith("error"))
            new_errors = sum(
                1
                for new_tagset in new_tagsets
                for s in new_tagset
                if s.startswith("error")
            )
            new_errors += sum(1 for s in new_topics if s.startswith("error"))

            if new_errors <= old_errors:
                return new_tagsets, new_topics

    return tagsets, topics
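
# Illustrative sketch of the retry behavior (hypothetical tags; the exact
# output depends on the tag tables): if "masculine/plural" fails to decode
# because no key contains that slash, it is retried as "masculine plural",
# which would yield something like ([("masculine", "plural")], []).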

def decode_tags1(
    src: str,
    allow_any=False,
    no_unknown_starts=False,
) -> tuple[list[tuple[str, ...]], list[str]]:
    """Decodes tags, doing some canonicalizations. This returns a list of
    lists of tags and a list of topics."""
    assert isinstance(src, str)

    # print("decode_tags: src={!r}".format(src))

    pos_paths: list[list[list[PosPathStep]]] = [[[]]]
    wordlst: list[str] = []
    max_last_i = 0  # pre-initialized here so that it can be used as a ref

    add_new = functools.partial(
        add_new1,  # pre-set parameters and references for the function
        pos_paths=pos_paths,
        wordlst=wordlst,
        allow_any=allow_any,
        no_unknown_starts=no_unknown_starts,
        max_last_i=max_last_i,
    )
    # First split the tags at commas and semicolons. Their significance is
    # that a multi-word sequence cannot continue across them.
    parts = split_at_comma_semi(src, extra=[";", ":"])

    for part in parts:
        max_last_i = len(wordlst)  # "how far have we gone?"
        lst1 = part.split()
        if not lst1:
            continue
        wordlst.extend(lst1)
        cur_nodes: list[tuple[ValidNode, int, int]] = []  # Currently seen
        for w in lst1:
            i = len(pos_paths) - 1
            new_nodes: list[tuple[ValidNode, int, int]] = []
            # replacement nodes for the next loop
            new_paths: list[list[PosPathStep]] = []
            # print("ITER i={} w={} max_last_i={} wordlst={}"
            #       .format(i, w, max_last_i, wordlst))
            node: ValidNode
            start_i: int
            last_i: int
            for node, start_i, last_i in cur_nodes:
                # ValidNodes are part of a search tree that checks if a
                # phrase is found in xlat_tags_map and other text->tags
                # dicts.
                if w in node.children:
                    # the phrase continues down the tree
                    # print("INC", w)
                    max_last_i = add_new(
                        node.children[w],
                        i,
                        start_i,
                        last_i,
                        new_paths,
                        new_nodes,
                    )
                if node.end:
                    # we've hit an end point; the tags and topics have
                    # already been gathered at some point, so don't do
                    # anything with the old stuff
                    if w in valid_sequences.children:
                        # This starts a *new* possible section
                        max_last_i = add_new(
                            valid_sequences.children[w],  # root->
                            i,
                            i,
                            i,
                            new_paths,
                            new_nodes,
                        )
                if w not in node.children and not node.end:
                    # print("w not in node and $: i={} last_i={} wordlst={}"
                    #       .format(i, last_i, wordlst))
                    # If i == last_i == 0, for example (beginning)
                    if (
                        i == last_i
                        or no_unknown_starts
                        or wordlst[last_i] not in allowed_unknown_starts
                    ):
                        # print("NEW", w)
                        if w in valid_sequences.children:
                            # Start new sequences here
                            max_last_i = add_new(
                                valid_sequences.children[w],
                                i,
                                i,
                                last_i,
                                new_paths,
                                new_nodes,
                            )
            if not new_nodes:
                # This is run at the start when i == max_last_i == 0,
                # which is what populates the first node in new_nodes.
                # Some initial words cause the rest to be interpreted as
                # unknown.
                # print("not new nodes: i={} last_i={} wordlst={}"
                #       .format(i, max_last_i, wordlst))
                if (
                    i == max_last_i
                    or no_unknown_starts
                    or wordlst[max_last_i] not in allowed_unknown_starts
                ):
                    # print("RECOVER w={} i={} max_last_i={} wordlst={}"
                    #       .format(w, i, max_last_i, wordlst))
                    if w in valid_sequences.children:
                        max_last_i = add_new(
                            # new sequence from the root
                            valid_sequences.children[w],
                            i,
                            i,
                            max_last_i,
                            new_paths,
                            new_nodes,
                        )
            cur_nodes = new_nodes  # Completely replace the nodes!
            # 2023-08-18, fix to improve performance
            # decode_tags does a big search for the best, shortest matching
            # sequences of tags, but the original algorithm didn't do any
            # culling during operation, so in a case with a lot of tags
            # (for example, big blocks of text inserted somewhere by
            # mistake and processed by decode_tags), it would lead to
            # exponential growth of new_paths contents. This culling, using
            # the same weighting algorithm as the original, is just applied
            # to new_paths before it is added to pos_paths. Basically it's
            # "take the 10 best paths". This *can* cause bugs if it gets
            # stuck in a local minimum or something, but this whole process
            # is one-dimensional and not that complex, so hopefully it
            # works out...
            pw = []
            path: list[PosPathStep]
            for path in new_paths:
                weight = len(path)
                if any(x[1] == ["UNKNOWN"] for x in path):
                    weight += 100  # Penalize unknown paths
                pw.append((weight, path))
            new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]]
            pos_paths.append(new_paths)

    # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}"
    #       .format(max_last_i, len(wordlst), len(pos_paths)))

    if cur_nodes:
        # print("END HAVE_NODES")
        for node, start_i, last_i in cur_nodes:
            if node.end:
                # print("$ END start_i={} last_i={}"
                #       .format(start_i, last_i))
                for path in pos_paths[start_i]:
                    pos_paths[-1].append(
                        [(last_i, node.tags, node.topics)] + path
                    )
            else:
                # print("UNK END start_i={} last_i={} wordlst={}"
                #       .format(start_i, last_i, wordlst))
                u = check_unknown(
                    last_i,
                    len(wordlst),
                    len(wordlst),
                    wordlst,
                    allow_any,
                    no_unknown_starts,
                )
                if pos_paths[start_i]:
                    for path in pos_paths[start_i]:
                        pos_paths[-1].append(u + path)
                else:
                    pos_paths[-1].append(u)
    else:
        # Check for a final unknown tag
        # print("NO END NODES max_last_i={}".format(max_last_i))
        paths = pos_paths[max_last_i] or [[]]
        u = check_unknown(
            max_last_i,
            len(wordlst),
            len(wordlst),
            wordlst,
            allow_any,
            no_unknown_starts,
        )
        if u:
            # print("end max_last_i={}".format(max_last_i))
            for path in list(paths):  # Copy in case it is the last pos
                pos_paths[-1].append(u + path)

    # import json
    # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True))

    if not pos_paths[-1]:
        # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src))
        return [], []

    # Find the best path
    pw = []
    for path in pos_paths[-1]:
        weight = len(path)
        if any(x[1] == ["UNKNOWN"] for x in path):
            weight += 100  # Penalize unknown paths
        pw.append((weight, path))
    path = min(pw)[1]

    # Convert the best path to tagsets and topics
    tagsets: list[list[str]] = [[]]
    topics: list[str] = []
    for i, tagspec, topicspec in path:
        if len(tagsets or "") > 16:
            # ctx.error("Too many tagsets! This is probably exponential",
            #           sortid="form_descriptions/20230818")
            return [("error-unknown-tag", "error-exponential-tagsets")], []
        if tagspec == ["UNKNOWN"]:
            new_tagsets = []
            for x in tagsets:
                new_tagsets.append(x + topicspec)
            tagsets = new_tagsets
            continue
        if tagspec:
            new_tagsets = []
            for x in tagsets:
                for t in tagspec:
                    if t:
                        new_tags = list(x)
                        for tag in t.split():
                            if tag not in new_tags:
                                new_tags.append(tag)
                        new_tagsets.append(new_tags)
                    else:
                        new_tagsets.append(x)
            tagsets = new_tagsets
        if topicspec:
            for t in topicspec:
                for topic in t.split():
                    if topic not in topics:
                        topics.append(topic)

    # print("unsorted tagsets:", tagsets)
    ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets))
    # topics = list(sorted(set(topics)))  XXX tests expect not sorted
    # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics))
    # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST
    # of tags. Turning topics into a tuple breaks tests, turning the tuples
    # inside tagsets into lists breaks tests; I'm leaving them mismatched
    # for now. XXX
    return ret_tagsets, topics

def parse_head_final_tags(
    wxr: WiktextractContext, lang: str, form: str
) -> tuple[str, list[str]]:
    """Parses tags that are allowed at the end of a form head from the end
    of the form. This can also be used for parsing the final gender etc.
    tags from translations and linkages."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(lang, str)  # Should be the language "form" is for
    assert isinstance(form, str)

    # print("parse_head_final_tags: lang={} form={!r}".format(lang, form))

    # Make sure there are no double spaces in the form, as this code does
    # not handle them otherwise.
    form = re.sub(r"\s+", " ", form.strip())
    if not form:
        return form, []

    origform = form

    tags = []

    # If parsing for certain Bantu languages (e.g., Swahili), handle
    # some extra head-final tags first
    if lang in head_final_bantu_langs:
        m = re.search(head_final_bantu_re, form)
        if m is not None:
            tagkeys = m.group(1)
            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
                form = form[: m.start()]
                v = head_final_bantu_map[tagkeys]
                if v.startswith("?"):
                    v = v[1:]
                    wxr.wtp.debug(
                        "suspicious suffix {!r} in language {}: {}".format(
                            tagkeys, lang, origform
                        ),
                        sortid="form_descriptions/1028",
                    )
                tags.extend(v.split())

    # If parsing for certain Semitic languages (e.g., Arabic), handle
    # some extra head-final tags first
    if lang in head_final_semitic_langs:
        m = re.search(head_final_semitic_re, form)
        if m is not None:
            tagkeys = m.group(1)
            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
                form = form[: m.start()]
                v = head_final_semitic_map[tagkeys]
                if v.startswith("?"):
                    v = v[1:]
                    wxr.wtp.debug(
                        "suspicious suffix {!r} in language {}: {}".format(
                            tagkeys, lang, origform
                        ),
                        sortid="form_descriptions/1043",
                    )
                tags.extend(v.split())

    # If parsing for certain other languages (e.g., Lithuanian,
    # French, Finnish), handle some extra head-final tags first
    if lang in head_final_other_langs:
        m = re.search(head_final_other_re, form)
        if m is not None:
            tagkeys = m.group(1)
            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
                form = form[: m.start()]
                tags.extend(head_final_other_map[tagkeys].split(" "))

    # Handle normal head-final tags
    # Loop this until nothing is found
    while True:
        prev_form = form
        m = re.search(head_final_re, form)
        if m is not None:
            # print(f"{m=}, {m.groups()=}")
            tagkeys = m.group(3)
            # Only replace tags ending with numbers in languages that have
            # head-final numeric tags (e.g., Bantu classes); also, don't
            # replace tags if the main title ends with them (then presume
            # they are part of the word)
            # print("head_final_tags form={!r} tagkeys={!r} lang={}"
            #       .format(form, tagkeys, lang))
            tagkeys_contains_digit = re.search(r"\d", tagkeys)
            if (
                (not tagkeys_contains_digit
                 or lang in head_final_numeric_langs)
                and not wxr.wtp.title.endswith(" " + tagkeys)  # type:ignore[union-attr]
                and
                # XXX the above test does not capture when the whole word
                # is an xlat_head_map key, so I added the below test to
                # complement it; does this break anything?
                not wxr.wtp.title == tagkeys
            ):  # defunct/English,
                # "more defunct" -> "more" ["archaic"]
                if (
                    not tagkeys_contains_digit
                    or lang in head_final_numeric_langs
                ):
                    # m.start(3) gets the start of what is in m.group(3)
                    form = form[: m.start(3)].strip()
                    v = xlat_head_map[tagkeys]
                    if v.startswith("?"):
                        v = v[1:]
                        wxr.wtp.debug(
                            "suspicious suffix {!r} in language {}: "
                            "{}".format(tagkeys, lang, origform),
                            sortid="form_descriptions/1077",
                        )
                    tags.extend(v.split())
        else:
            break
        if prev_form == form:
            break

    # Generate warnings about words ending in " or" after processing
    if (
        (form.endswith(" or") and not origform.endswith(" or"))
        or re.search(
            r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|"
            r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)"
            r"($|/| (f|m|sg|pl|anim|inan))",
            form,
        )
        or form.endswith(" du")
    ):
        if form not in ok_suspicious_forms:
            wxr.wtp.debug(
                "suspicious unhandled suffix in {}:"
                " {!r}, originally {!r}".format(lang, form, origform),
                sortid="form_descriptions/1089",
            )

    # print("parse_head_final_tags: form={!r} tags={}".format(form, tags))
    return form, tags
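
# Illustrative sketch (assuming "Spanish" is not in the special head-final
# language sets and xlat_head_map maps "m" to "masculine", as the gender
# abbreviations conventionally do):
#     parse_head_final_tags(wxr, "Spanish", "perro m")
# would return ("perro", ["masculine"]).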


def quote_kept_parens(s: str) -> str:
    """Changes certain parenthesized expressions so that they won't be
    interpreted as parentheses. This is used for parts that are kept as
    part of the word, such as "rear admiral (upper half)"."""
    return re.sub(
        r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|"
        r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)",
        r"__lpar__\1__rpar__",
        s,
    )


def quote_kept_ruby(
    wxr: WiktextractContext,
    ruby_tuples: list[
        tuple[
            str,
            str,
        ]
    ],
    s: str,
) -> str:
    if len(ruby_tuples) < 1:
        wxr.wtp.debug(
            "quote_kept_ruby called with no ruby",
            sortid="form_description/1114/20230517",
        )
        return s
    ks = []
    rs = []
    for k, r in ruby_tuples:
        ks.append(re.escape(k))
        rs.append(re.escape(r))
    if not (ks and rs):
        wxr.wtp.debug(
            f"empty column in ruby_tuples: {ruby_tuples}",
            sortid="form_description/1124/20230606",
        )
        return s
    newm = re.compile(
        r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs))
    )
    rub_re = re.compile(
        r"({})".format(
            r"|".join(
                r"{}\(*{}\)*".format(
                    re.escape(k),
                    re.escape(r),
                )
                for k, r in ruby_tuples
            )
        )
    )

    def paren_replace(m: re.Match) -> str:
        return re.sub(newm, r"\1__lrub__\2__rrub__", m.group(0))

    return re.sub(rub_re, paren_replace, s)


def unquote_kept_parens(s: str) -> str:
    """Converts the quoted parentheses back to normal parentheses."""
    return re.sub(r"__lpar__(.*?)__rpar__", r"(\1)", s)
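
# Round-trip example:
#     quote_kept_parens("rear admiral (upper half)")
#     # -> "rear admiral __lpar__upper half__rpar__"
#     unquote_kept_parens("rear admiral __lpar__upper half__rpar__")
#     # -> "rear admiral (upper half)"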

1504 

1505 

1506def add_romanization( 

1507 wxr: WiktextractContext, 

1508 data: WordData, 

1509 roman: str, 

1510 text: str, 

1511 is_reconstruction: bool, 

1512 head_group: Optional[int], 

1513 ruby: Sequence[tuple[str, str]], 

1514) -> None: 

1515 tags_lst = ["romanization"] 

1516 m = re.match(r"([^:]+):(.+)", roman) 

1517 # This function's purpose is to intercept broken romanizations, 

1518 # like "Yale: hēnpyeng" style tags. Most romanization styles 

1519 # are already present as tags, so we can use decode_tags to find 

1520 # them. 

1521 if m: 1521 ↛ 1522line 1521 didn't jump to line 1522 because the condition on line 1521 was never true

1522 tagsets, topics = decode_tags(m.group(1)) 

1523 if tagsets: 

1524 for tags in tagsets: 

1525 tags_lst.extend(tags) 

1526 roman = m.group(2) 

1527 add_related( 

1528 wxr, 

1529 data, 

1530 tags_lst, 

1531 [roman], 

1532 text, 

1533 True, 

1534 is_reconstruction, 

1535 head_group, 

1536 ruby, 

1537 ) 

1538 

1539 

1540def add_related( 

1541 wxr: WiktextractContext, 

1542 data: WordData, 

1543 tags_lst: Union[list[str], tuple[str, ...]], 

1544 related_list: list[str], 

1545 origtext: str, 

1546 add_all_canonicals: bool, 

1547 is_reconstruction: bool, 

1548 head_group: Optional[int], 

1549 ruby_data: Optional[Sequence[tuple[str, str]]] = None, 

1550 links: list[tuple[str, str]] | None = None, 

1551 link_dict: dict[str, list[str]] | None = None, 

1552) -> Optional[list[tuple[str, ...]]]: 

1553 """Internal helper function for some post-processing entries for related 

1554 forms (e.g., in word head). This returns a list of list of tags to be 

1555 added to following related forms or None (cf. walrus/English word head, 

1556 parenthesized part starting with "both").""" 

1557 assert isinstance(wxr, WiktextractContext) 

1558 assert isinstance(tags_lst, (list, tuple)) 

1559 for x in tags_lst: 

1560 assert isinstance(x, str) 

1561 assert isinstance(related_list, (list, tuple)) 

1562 assert isinstance(origtext, str) 

1563 assert add_all_canonicals in (True, False) 

1564 assert isinstance(ruby_data, (list, tuple)) or ruby_data is None 

1565 if ruby_data is None: 1565 ↛ 1566line 1565 didn't jump to line 1566 because the condition on line 1565 was never true

1566 ruby_data = [] 

1567 related = " ".join(related_list) 

1568 # print("add_related: tags_lst={} related={}".format(tags_lst, related)) 

1569 if related == "[please provide]": 1569 ↛ 1570line 1569 didn't jump to line 1570 because the condition on line 1569 was never true

1570 return None 

1571 if related in IGNORED_RELATED: 1571 ↛ 1572line 1571 didn't jump to line 1572 because the condition on line 1571 was never true

1572 return None 

1573 if is_reconstruction and related.startswith("*") and len(related) > 1: 

1574 related = related[1:] 

1575 

1576 # print(f"{links=}, {link_dict=}") 

1577 # Get title word, with any reconstruction prefix removed 

1578 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title) # type:ignore[arg-type] 

1579 

1580 def check_related(related: str) -> None: 

1581 # Warn about some suspicious related forms 

1582 m = re.search(suspicious_related_re, related) 

1583 if (m and m.group(0) not in titleword) or ( 

1584 related in ("f", "m", "n", "c") and len(titleword) >= 3 

1585 ): 

1586 if "eumhun" in tags_lst: 1586 ↛ 1587line 1586 didn't jump to line 1587 because the condition on line 1586 was never true

1587 return 

1588 if "cangjie-input" in tags_lst: 1588 ↛ 1589line 1588 didn't jump to line 1589 because the condition on line 1588 was never true

1589 return 

1590 if "class" in tags_lst: 1590 ↛ 1591line 1590 didn't jump to line 1591 because the condition on line 1590 was never true

1591 return 

1592 if wxr.wtp.section == "Korean" and re.search( 1592 ↛ 1596line 1592 didn't jump to line 1596 because the condition on line 1592 was never true

1593 r"^\s*\w*>\w*\s*$", related 

1594 ): 

1595 # ignore Korean "i>ni" / "라>나" values 

1596 return 

1597 if ( 1597 ↛ 1604line 1597 didn't jump to line 1604 because the condition on line 1597 was never true

1598 wxr.wtp.section == "Burmese" 

1599 and "romanization" in tags_lst 

1600 and re.search(r":", related) 

1601 ): 

1602 # ignore Burmese with ":", that is used in Burmese 

1603 # translitteration of "း", the high-tone visarga. 

1604 return 

1605 wxr.wtp.debug( 

1606 "suspicious related form tags {}: {!r} in {!r}".format( 

1607 tags_lst, related, origtext 

1608 ), 

1609 sortid="form_descriptions/1147", 

1610 ) 

1611 

1612 following_tagsets = None # Tagsets to add to following related forms 

1613 roman = None 

1614 tagsets1: list[tuple[str, ...]] = [tuple()] 

1615 topics1: list[str] = [] 

1616 

1617 m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related) 

1618 if m: 

1619 paren = m.group(1) 

1620 related = related[m.end() :] 

1621 m = re.match(r"^(all|both) (.*)", paren) 

1622 if m: 1622 ↛ 1623line 1622 didn't jump to line 1623 because the condition on line 1622 was never true

1623 tagsets1, topics1 = decode_tags(m.group(2)) 

1624 following_tagsets = tagsets1 

1625 else: 

1626 tagsets1, topics1 = decode_tags(paren) 

1627 else: 

1628 m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related) 

1629 if m: 

1630 paren = m.group(1) 

1631 if paren.startswith("U+"): 1631 ↛ 1632line 1631 didn't jump to line 1632 because the condition on line 1631 was never true

1632 related = related[: m.start()] 

1633 else: 

1634 cls = classify_desc(paren) 

1635 if ( 1635 ↛ 1642line 1635 didn't jump to line 1642 because the condition on line 1635 was always true

1636 cls in ("romanization", "english") 

1637 and classify_desc(related[: m.start()]) == "other" 

1638 ): 

1639 roman = paren 

1640 related = related[: m.start()] 

1641 else: 

1642 related = related[: m.start()] 

1643 tagsets1, topics1 = decode_tags(paren) 

1644 if related and related.startswith("{{"): 1644 ↛ 1645line 1644 didn't jump to line 1645 because the condition on line 1644 was never true

1645 wxr.wtp.debug( 

1646 "`{{` in word head form - possible Wiktionary error: {!r}".format( 

1647 related 

1648 ), 

1649 sortid="form_descriptions/1177", 

1650 ) 

1651 return None # Likely Wiktionary coding error 

1652 related = unquote_kept_parens(related) 

1653 # Split related by "/" (e.g., grande/Spanish) superlative in head 

1654 # Do not split if / in word title, see π//Japanese 

1655 if len(related) > 5 and "/" not in wxr.wtp.title: # type:ignore[operator] 

1656 alts = split_at_comma_semi(related, separators=["/"]) 

1657 else: 

1658 alts = [related] 

1659 if ruby_data: 

1660 # prepare some regex stuff in advance 

1661 ks, rs = [], [] 

1662 for k, r in ruby_data: 

1663 ks.append(re.escape(k)) 

1664 rs.append(re.escape(r)) 

1665 splitter = r"((?:{})__lrub__(?:{})__rrub__)".format( 

1666 "|".join(ks), "|".join(rs) 

1667 ) 

1668 for related in alts: 

1669 ruby: list[tuple[str, str]] = [] 

1670 if ruby_data: 

1671 new_related = [] 

1672 rub_split = re.split(splitter, related) 

1673 for s in rub_split: 

1674 m = re.match(r"(.+)__lrub__(.+)__rrub__", s) 

1675 if m: 

1676 # add ruby with (\1, \2) 

1677 ruby.append((m.group(1), m.group(2))) 

1678 new_related.append(m.group(1)) 

1679 else: 

1680 new_related.append(s) 

1681 related = "".join(new_related) 

1682 tagsets2, topics2 = decode_tags(" ".join(tags_lst)) 

1683 for tags1 in tagsets1: 

1684 assert isinstance(tags1, (list, tuple)) 

1685 for tags2 in tagsets2: 

1686 assert isinstance(tags1, (list, tuple)) 

1687 dt: LinkageData = {"word": related} 

1688 if roman: 

1689 dt["roman"] = roman 

1690 if ruby: 

1691 dt["ruby"] = ruby 

1692 if "alt-of" in tags2: 1692 ↛ 1693line 1692 didn't jump to line 1693 because the condition on line 1692 was never true

1693 check_related(related) 

1694 data_extend(data, "tags", tags1) 

1695 data_extend(data, "tags", tags2) 

1696 data_extend(data, "topics", topics1) 

1697 data_extend(data, "topics", topics2) 

1698 data_append(data, "alt_of", dt) 

1699 elif "form-of" in tags2: 1699 ↛ 1700line 1699 didn't jump to line 1700 because the condition on line 1699 was never true

1700 check_related(related) 

1701 data_extend(data, "tags", tags1) 

1702 data_extend(data, "tags", tags2) 

1703 data_extend(data, "topics", topics1) 

1704 data_extend(data, "topics", topics2) 

1705 data_append(data, "form_of", dt) 

1706 elif "compound-of" in tags2: 1706 ↛ 1707line 1706 didn't jump to line 1707 because the condition on line 1706 was never true

1707 check_related(related) 

1708 data_extend(data, "tags", tags1) 

1709 data_extend(data, "tags", tags2) 

1710 data_extend(data, "topics", topics1) 

1711 data_extend(data, "topics", topics2) 

1712 data_append(data, "compound", related) 

1713 else: 

1714 lang = wxr.wtp.section or "LANG_MISSING" 

1715 related, final_tags = parse_head_final_tags( 

1716 wxr, lang, related 

1717 ) 

1718 # print("add_related: related={!r} tags1={!r} tags2={!r} " 

1719 # "final_tags={!r}" 

1720 # .format(related, tags1, tags2, final_tags)) 

1721 tags = list(tags1) + list(tags2) + list(final_tags) 

1722 check_related(related) 

1723 form: FormData = {"form": related} 

1724 if ( 

1725 links 

1726 and link_dict 

1727 and ( 

1728 form_links := match_links_to_form( 

1729 wxr, related, links, link_dict 

1730 ) 

1731 ) 

1732 ): 

1733 form["links"] = form_links 

1734 if head_group: 

1735 form["head_nr"] = head_group 

1736 if roman: 

1737 form["roman"] = roman 

1738 if ruby: 

1739 form["ruby"] = ruby 

1740 data_extend(form, "topics", topics1) 

1741 data_extend(form, "topics", topics2) 

1742 if topics1 or topics2:

1743 wxr.wtp.debug( 

1744 "word head form has topics: {}".format(form), 

1745 sortid="form_descriptions/1233", 

1746 ) 

1747 # Add tags from canonical form into the main entry 

1748 if "canonical" in tags: 

1749 if related in ("m", "f") and len(titleword) > 1:

1750 wxr.wtp.debug( 

1751 "probably incorrect canonical form " 

1752 "{!r} ignored (probably tag combination " 

1753 "missing from xlat_head_map)".format(related), 

1754 sortid="form_descriptions/1241", 

1755 ) 

1756 continue 

1757 if ( 

1758 related != titleword 

1759 or add_all_canonicals 

1760 or topics1 

1761 or topics2 

1762 or ruby 

1763 ): 

1764 data_extend(form, "tags", sorted(set(tags))) 

1765 else: 

1766 # We won't add canonical form here 

1767 filtered_tags = list( 

1768 x for x in tags if x != "canonical" 

1769 ) 

1770 data_extend(data, "tags", filtered_tags) 

1771 continue 

1772 else: 

1773 data_extend(form, "tags", sorted(set(tags))) 

1774 # Only insert if the form is not already there 

1775 for old in data.get("forms", ()): 

1776 if form == old:

1777 break 

1778 else: 

1779 data_append(data, "forms", form) 

1780 

1781 # If this form had pre-tags that started with "both" or "all", add those 

1782 # tags also to following related forms that don't have their own tags 

1783 # specified. 

1784 return following_tagsets 

1785 

1786 

1787def match_links_to_form( 

1788 wxr: WiktextractContext, 

1789 form: str, 

1790 links: list[tuple[str, str]], 

1791 link_dict: dict[str, list[str]] | None, 

1792) -> list[tuple[str, str]] | None: 

1793 if not links:

1794 return None 

1795 if link_dict is None: 

1796 link_dict = {} 

1797 for ltxt, ltrg in links: 

1798 if ltxt not in link_dict: 

1799 link_dict[ltxt] = [ 

1800 ltrg, 

1801 ] 

1802 else: 

1803 link_dict[ltxt].append(ltrg) 

1804 ret: list[tuple[str, str]] = [] 

1805 if form in link_dict: 

1806 if len(link_dict[form]) > 1: 

1807 wxr.wtp.warning( 

1808 f"{form=} has many different " 

1809 "link candidates `{link_dict[form]}`, " 

1810 "which can't be disambiguated.", 

1811 sortid="form_descriptions/match_links_to_form", 

1812 ) 

1813 for ltarg in link_dict[form]: 

1814 ret.append((form, ltarg)) 

1815 elif " " in form: 

1816 # split and search for a sequence of links... 

1817 split_forms = form.split() 

1818 found = False 

1819 for i, (ltext, ltarg) in enumerate(links): 

1820 if ltext == split_forms[0]: 

1821 for j, f in enumerate(split_forms): 

1822 if i + j >= len(links): 

1823 break 

1824 if f.strip(",;() ") != links[i + j][0].strip(",;() "): 

1825 break 

1826 if i + j == len(links):

1827 break 

1828 else: 

1829 found = True 

1830 if found: 

1831 ret = links[i : i + len(split_forms)] 

1832 break 

1833 # We only care about weird links 

1834 # print(f"{len(ret)=}, {ret}") 

1835 for txt, tar in ret: 

1836 if txt != tar and txt != tar[: tar.find("#")]: 

1837 break 

1838 else: 

1839 return None 

1840 return ret or None 
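# Usage sketch for match_links_to_form (hypothetical data): with
# form == "foo bar" and links == [("foo", "foo#English"), ("bar", "baz")],
# the split-and-match branch collects both pairs and returns them, because
# ("bar", "baz") is a "weird" link whose text and target differ; if every
# pair were an identity link like ("foo", "foo#English"), the final loop
# would fall through to "return None".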

1841 

1842 

1843 # Issue #967: in English entries, word forms are sometimes skipped because 

1844 # they look like taggable words and their distw() is too big, like "clipping" from "clip" 

1845WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = { 

1846 "clip": ["clipping"], # XXX remember to change me back to clipping after 

1847 "English": ["English", "Englishes"], 

1848 "common": ["common", "commoner"], 

1849 # tests. 

1850} 

1851 

1852WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = { 

1853 "unaccountability": ["countable", "uncountable"], 

1854 "uncountability": ["countable", "uncountable"], 

1855} 

1856 

1857FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {} 

1858 

1859FORM_ASSOCIATED_TAG_WORDS: set[str] = { 

1860 "participle", 

1861 "past", 

1862 "present", 

1863 "singular", 

1864 "plural", 

1865 "first-person", 

1866 "second-person", 

1867 "third-person", 

1868 "gerund", 

1869} 

1870 

1871 

1872def parse_word_head( 

1873 wxr: WiktextractContext, 

1874 word: str, 

1875 pos: str, 

1876 text: str, 

1877 data: WordData, 

1878 is_reconstruction: bool, 

1879 head_group: Optional[int], 

1880 original_header_nodes: list[WikiNode | str] | None = None, 

1881 ruby=None, 

1882 links: list[ 

1883 tuple[ 

1884 str, 

1885 str, 

1886 ] 

1887 ] 

1888 | None = None, 

1889) -> None: 

1890 """Parses the head line for a word for in a particular language and 

1891 part-of-speech, extracting tags and related forms.""" 

1892 assert isinstance(wxr, WiktextractContext) 

1893 assert isinstance(pos, str) 

1894 assert isinstance(text, str) 

1895 assert isinstance(data, dict) 

1896 assert isinstance(ruby, (list, tuple)) or ruby is None 

1897 if ruby is None: 

1898 ruby = [] 

1899 assert is_reconstruction in (True, False) 

1900 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text)) 

1901 # print(f"PARSE_WORD_HEAD: {data=}") 

1902 # print(f"PARSE_WORD_HEAD: {links=}") 

1903 

1904 # Save original text for if we want to look for mismatched form-links 

1905 

1906 link_dict: dict[str, list[str]] | None 

1907 if links is not None: 

1908 link_dict = {} 

1909 for ltxt, ltrg in links: 

1910 if ltxt not in link_dict: 

1911 link_dict[ltxt] = [ 

1912 ltrg, 

1913 ] 

1914 else: 

1915 link_dict[ltxt].append(ltrg) 

1916 else: 

1917 link_dict = None 

1918 

1919 # print(f"MAIN: {links=}") 

1920 link_words_not_alnum = [] 

1921 if not word.isalnum(): 

1922 # `-` is kosher, add more of these if needed. 

1923 if word.replace("-", "").isalnum(): 

1924 pass 

1925 else: 

1926 # if the word contains non-letter or -number characters, it 

1927 # might have something that messes with split-at-semi-comma; we 

1928 # collect links so that we can skip splitting them. 

1929 if links is None and original_header_nodes is not None: 

1930 links, _ = extract_links_from_node( 

1931 wxr, 

1932 original_header_nodes, 

1933 remove_anchor_tags=True, 

1934 expand_nodes=True, 

1935 ) 

1936 if links is not None:

1937 for ltext, ltar in links: 

1938 if not ltext.isalnum(): 

1939 link_words_not_alnum.append(ltext) 

1940 if word not in link_words_not_alnum:

1941 link_words_not_alnum.append(word) 

1942 

1943 if link_words_not_alnum is None:

1944 link_words_not_alnum = [] 

1945 

1946 if len(link_words_not_alnum) > 0: 

1947 # if we have link data (that is, links with stuff like commas and 

1948 # spaces, replace word_re with a modified local scope pattern 

1949 # print(f"links {list((c, ord(c)) for link in links for c in link)=}") 

1950 word_re = re.compile( 

1951 r"\b" # In case we have forms that are longer and contain links 

1952 + 

1953 # or words as a substring... 

1954 r"\b|\b".join( 

1955 sorted( 

1956 (re.escape(s) for s in link_words_not_alnum), 

1957 key=lambda x: -len(x), 

1958 ) 

1959 ) 

1960 + r"\b|" 

1961 + word_pattern 

1962 ) 

1963 else: 

1964 word_re = word_re_global 
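# Sketch of the constructed pattern (assuming link_words_not_alnum ==
# ["New York"] and glossing over re.escape() details):
#   word_re ~ re.compile(r"\bNew York\b|" + word_pattern)
# i.e. multi-word link texts are matched whole, longest first, before the
# generic word_pattern gets a chance to split them.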

1965 

1966 if "Lua execution error" in text or "Lua timeout error" in text: 1966 ↛ 1967line 1966 didn't jump to line 1967 because the condition on line 1966 was never true

1967 return 

1968 

1969 # Fix words with "superlative:" or "comparative:" at end of head 

1970 # e.g. grande/Spanish/Adj 

1971 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text) 

1972 

1973 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb 

1974 m = re.search(r", non-past ([^)]+ \([^)]+\))", text) 

1975 if m: 

1976 add_related( 

1977 wxr, 

1978 data, 

1979 ["non-past"], 

1980 [m.group(1)], 

1981 text, 

1982 True, 

1983 is_reconstruction, 

1984 head_group, 

1985 ruby, 

1986 links, 

1987 link_dict, 

1988 ) 

1989 text = text[: m.start()] + text[m.end() :] 
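# e.g. a head ending "..., non-past يفعل (yafʕalu)" (hypothetical
# transliteration) would add "يفعل (yafʕalu)" as a "non-past" form via
# add_related and remove the matched span from the head text.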

1990 

1991 language = wxr.wtp.section 

1992 titleword = re.sub( 

1993 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE" 

1994 ) 

1995 titleparts = list( 

1996 m.group(0) 

1997 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE") 

1998 ) 

1999 if not titleparts:

2000 return 

2001 

2002 # Remove " or" from the end to prevent weird canonical forms 

2003 if text.endswith(" or"): 

2004 for tp in titleparts: 

2005 if text.endswith(tp):

2006 break 

2007 else: 

2008 text = text.removesuffix(" or").rstrip() 

2009 

2010 # Handle the part of the head that is not in parentheses. However, certain 

2011 # parenthesized parts are part of the word, and those must be handled 

2012 # specially here. 

2013 if ruby: 

2014 text = quote_kept_ruby(wxr, ruby, text) 

2015 base = text 

2016 base = quote_kept_parens(base) 

2017 base = remove_text_in_parentheses(base) 

2018 base = base.replace("?", "") # Removes uncertain articles etc 

2019 base = re.sub(r"\s+", " ", base) 

2020 base = re.sub(r" ([,;])", r"\1", base) 

2021 base = re.sub(r" • ", r" ", base) 

2022 # Many languages use • as a punctuation mark separating the base 

2023 # from the rest of the head. στάδιος/Ancient Greek, issue #176 

2024 base = base.strip() 

2025 # print(f"{base=}") 

2026 

2027 # Check for certain endings in head (mostly for compatibility with weird 

2028 # heads, e.g. rata/Romanian "1st conj." at end) 

2029 m = re.search(head_end_re, base) 

2030 tags: Union[tuple[str, ...], list[str]] = [] 

2031 if m:

2032 tags = head_end_map[m.group(1).lower()].split() 

2033 data_extend(data, "tags", tags) 

2034 base = base[: m.start()] 

2035 

2036 # Special case: handle Hán Nôm readings for Vietnamese characters 

2037 m = re.match( 

2038 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base 

2039 ) 

2040 if m:

2041 tag, readings = m.groups() 

2042 tag = re.sub(r"\s+", "-", tag) 

2043 for reading in split_at_comma_semi( 

2044 readings, skipped=link_words_not_alnum 

2045 ): 

2046 add_related( 

2047 wxr, 

2048 data, 

2049 [tag], 

2050 [reading], 

2051 text, 

2052 True, 

2053 is_reconstruction, 

2054 head_group, 

2055 ruby, 

2056 links, 

2057 link_dict, 

2058 ) 

2059 return 
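# e.g. a base like "字: Hán Nôm readings: tự, chữ" (hypothetical readings)
# would add the forms "tự" and "chữ" tagged "Hán-Nôm" (the space in the
# tag is replaced by "-" above) and then return early.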

2060 

2061 # Special case: Hebrew " [pattern: nnn]" ending 

2062 m = re.search(r"\s+\[pattern: ([^]]+)\]", base) 

2063 if m:

2064 add_related( 

2065 wxr, 

2066 data, 

2067 ["class"], 

2068 [m.group(1)], 

2069 text, 

2070 True, 

2071 is_reconstruction, 

2072 head_group, 

2073 ruby, 

2074 links, 

2075 link_dict, 

2076 ) 

2077 base = base[: m.start()] + base[m.end() :] 

2078 

2079 # Clean away some messy "Upload an image" template text used in 

2080 # American Sign Language: 

2081 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp 

2082 m = re.search(r"Upload .+ gif image.", base) 

2083 if m:

2084 base = base[: m.start()] + base[m.end() :] 

2085 

2086 semicolon_present = False 

2087 # Split the head into alternatives. This is a complicated task, as 

2088 # we do not want to split on "or" or "," when immediately followed by more 

2089 # head-final tags, but otherwise do want to split by them. 

2090 # 20230907 added "or" to this to handle 'true or false', titles with 'or' 

2091 if wxr.wtp.title and ( 

2092 "," in wxr.wtp.title or ";" in wxr.wtp.title or " or " in wxr.wtp.title 

2093 ): 

2094 # If the title has ";", we don't want to split on that and can remove 

2095 # the ; from the splitting regex pretty easily because it's uncommon. 

2096 # However, commas are so common that not splitting on them is just 

2097 # not feasible, and we have to just deal with that if there are 

2098 # alternative forms or variations with stray commas that shouldn't 

2099 # be split. 

2100 if ";" in wxr.wtp.title: 

2101 semicolon_present = True 

2102 base = base.replace(";", "<SEMICOLON>") 

2103 default_splitter = head_split_no_semicolon_re 

2104 else: 

2105 default_splitter = head_split_re 

2106 # A kludge to handle article titles/phrases with commas. 

2107 # Preprocess splits to first capture the title, then handle 

2108 # all the others as usual. 

2109 presplits = re.split(r"({})".format(wxr.wtp.title), base) 

2110 splits = [] 

2111 for psplit in presplits: 

2112 if psplit == wxr.wtp.title: 

2113 splits.append(psplit) 

2114 else: 

2115 splits.extend(re.split(default_splitter, psplit)) 

2116 else: 

2117 # Do the normal split; previous only-behavior. 

2118 splits = re.split(head_split_re, base) 

2119 # print("BASE: ", repr(base)) 

2120 # print("SPLITS:", splits) 

2121 alts: list[str] = [] 

2122 # print("parse_word_head: splits:", splits, 

2123 # "head_split_re_parens:", head_split_re_parens) 

2124 for i in range( 

2125 0, len(splits) - head_split_re_parens, head_split_re_parens + 1 

2126 ): 

2127 v = splits[i] 

2128 ending = splits[i + 1] or "" # XXX is this correct??? 

2129 # print("parse_word_head alts v={!r} ending={!r} alts={}" 

2130 # .format(v, ending, alts)) 

2131 if alts and (v == "" and ending): 

2132 assert ending[0] == " " 

2133 alts[-1] += " or" + ending # ending starts with a space 

2134 elif v or ending: 

2135 alts.append((v or "") + (ending or "")) 

2136 last = splits[-1].strip() 

2137 conn = "" if len(splits) < 3 else splits[-2] 

2138 # print("parse_word_head alts last={!r} conn={!r} alts={}" 

2139 # .format(last, conn, alts)) 

2140 if (

2141 alts 

2142 and last 

2143 and ( 

2144 last.split()[0] in xlat_head_map 

2145 or ( 

2146 conn == " or " 

2147 and (alts[-1] + " or " + last).strip() in xlat_head_map 

2148 ) 

2149 ) 

2150 ): 

2151 alts[-1] += " or " + last 

2152 elif last:

2153 alts.append(last) 

2154 

2155 # print("parse_word_head alts: {}".format(alts)) 

2156 # print(f"{base=}") 

2157 

2158 # Process the head alternatives 

2159 canonicals: list[tuple[list[str], list[str]]] = [] 

2160 mode: Optional[str] = None 

2161 for alt_i, alt in enumerate(alts): 

2162 alt = alt.strip() 

2163 if alt.startswith("compound form:"):

2164 mode = "compound-form" 

2165 alt = alt[14:].strip() 

2166 if ((dash_i := alt.find(" -")) > 0) and ( 

2167 dash_i > (wxr.wtp.title or "").find(" -") 

2168 ): 

2169 # test_en_head / test_suffixes_at_end_of_form1 

2170 # Some heads have suffixes that end up attached to the form 

2171 # like in https://en.wiktionary.org/wiki/%E6%A5%BD%E3%81%97%E3%81%84 

2172 alt = alt[:dash_i] 

2173 if mode == "compound-form":

2174 add_related( 

2175 wxr, 

2176 data, 

2177 ["in-compounds"], 

2178 [alt], 

2179 text, 

2180 True, 

2181 is_reconstruction, 

2182 head_group, 

2183 ruby, 

2184 links, 

2185 link_dict, 

2186 ) 

2187 continue 

2188 # For non-first parts, see if it can be treated as tags-only 

2189 if alt_i == 0: 

2190 expanded_alts = [alt] 

2191 else: 

2192 expanded_alts = map_with(xlat_descs_map, [alt]) 

2193 # print("EXPANDED_ALTS:", expanded_alts) 

2194 tagsets: Optional[list[tuple[str, ...]]] 

2195 for alt in expanded_alts: 

2196 baseparts = list(m.group(0) for m in word_re.finditer(alt)) 

2197 if alt_i > 0: 

2198 tagsets, topics = decode_tags(" ".join(baseparts)) 

2199 if not any("error-unknown-tag" in x for x in tagsets): 

2200 data_extend(data, "topics", topics) 

2201 for tags1 in tagsets: 

2202 data_extend(data, "tags", tags1) 

2203 continue 

2204 

2205 alt, tags = parse_head_final_tags( 

2206 wxr, language or "MISSING_LANG", alt 

2207 ) 

2208 tags = list(tags) # Make sure we don't modify anything cached 

2209 tags.append("canonical") 

2210 if alt_i == 0 and "," in wxr.wtp.title or ";" in wxr.wtp.title: # type:ignore[operator] 

2211 # Kludge to handle article titles/phrases with commas. 

2212 # basepart's regex strips commas, which leads to a 

2213 # canonical form that is the title phrase without a comma. 

2214 # basepart in add_related is almost immediately joined with 

2215 # spaces anyhow. XXX not exactly sure why it's 

2216 # canonicals.append((tags, baseparts)) and not (tags, [alt]) 

2217 baseparts = [alt] 

2218 canonicals.append((tags, baseparts)) 

2219 

2220 # If more of this kind of replace-and-return-original kind of stuff is 

2221 # needed, make semicolon_present into a flag enum, something like `modified` 

2222 if semicolon_present: 

2223 new_cans = [] 

2224 for tags, baseparts in canonicals: 

2225 new_cans.append( 

2226 (tags, [s.replace("<SEMICOLON>", ";") for s in baseparts]) 

2227 ) 

2228 canonicals = new_cans 

2229 for tags, baseparts in canonicals: 

2230 add_related( 

2231 wxr, 

2232 data, 

2233 tags, 

2234 baseparts, 

2235 text, 

2236 len(canonicals) > 1, 

2237 is_reconstruction, 

2238 head_group, 

2239 ruby, 

2240 links, 

2241 link_dict, 

2242 ) 
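# End-to-end sketch of the alternatives handling above (hypothetical
# Spanish head "perro m or perra f"): assuming parse_head_final_tags maps
# the trailing "m"/"f" to gender tags, canonicals ends up holding roughly
#   (["masculine", "canonical"], ["perro"]) and
#   (["feminine", "canonical"], ["perra"]),
# which are then handed to add_related one by one.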

2243 

2244 # Handle parenthesized descriptors for the word form and links to 

2245 # related words 

2246 text = quote_kept_parens(text) 

2247 parens = list( 

2248 m.group(2) 

2249 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text) 

2250 ) 

2251 parens.extend( 

2252 m.group(1) 

2253 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text) 

2254 ) 

2255 have_romanization = False 

2256 have_ruby = False 

2257 hiragana = "" 

2258 katakana = "" 

2259 for paren in parens: 

2260 paren = paren.strip() 

2261 if not paren:

2262 continue 

2263 if paren.startswith("see "): 

2264 continue 

2265 if paren.startswith("U+"): 2265 ↛ 2266line 2265 didn't jump to line 2266 because the condition on line 2265 was never true

2266 continue 

2267 # In some rare cases, strip the word that inflects from the form 

2268 # description, e.g. "look through rose-tinted glasses"/English. 

2269 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren) 

2270 

2271 # If it starts with hiragana or katakana, treat as such form. Note 

2272 # that each hiragana/katakana character is in separate parentheses, 

2273 # so we must concatenate them. 

2274 try: 

2275 un = unicodedata.name(paren[0]).split()[0] 

2276 except ValueError: 

2277 un = "INVALID" 

2278 if un == "KATAKANA": 2278 ↛ 2279line 2278 didn't jump to line 2279 because the condition on line 2278 was never true

2279 katakana += paren 

2280 have_ruby = True 

2281 continue 

2282 if un == "HIRAGANA": 2282 ↛ 2283line 2282 didn't jump to line 2283 because the condition on line 2282 was never true

2283 hiragana += paren 

2284 have_ruby = True 

2285 continue 

2286 

2287 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes," 

2288 # in the middle of the parenthesized expression, e.g. 薄 

2289 def strokes_repl(m: re.Match) -> str: 

2290 strokes1, tags1, strokes2, tags2 = m.groups() 

2291 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]: 

2292 tags = tags.split(", ") 

2293 tags = list( 

2294 "Mainland China" if t == "Mainland" else t for t in tags 

2295 ) 

2296 tags.append("strokes") 

2297 add_related( 

2298 wxr, 

2299 data, 

2300 tags, 

2301 [strokes], 

2302 text, 

2303 True, 

2304 is_reconstruction, 

2305 head_group, 

2306 ruby, 

2307 links, 

2308 link_dict, 

2309 ) 

2310 return ", " 

2311 

2312 paren = re.sub( 

2313 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ", 

2314 strokes_repl, 

2315 paren, 

2316 ) 
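# e.g. ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes, " adds a
# "16" form tagged ["Japan", "Mainland China", "strokes"] and a "17" form
# tagged ["Hong Kong", "Taiwan", "strokes"], and collapses the matched
# span to ", ".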

2317 

2318 descriptors = map_with(xlat_descs_map, [paren]) 

2319 new_desc = [] 

2320 for desc in descriptors: 

2321 new_desc.extend( 

2322 map_with( 

2323 xlat_tags_map, 

2324 split_at_comma_semi( 

2325 desc, extra=[", or "], skipped=link_words_not_alnum 

2326 ), 

2327 ) 

2328 ) 

2329 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None 

2330 following_tags = None # Added to prev_tags from previous parenthesized 

2331 # part, e.g. walrus/English 

2332 # "(both nonstandard, proscribed, uncommon)" 

2333 for desc_i, desc in enumerate(new_desc): 

2334 # print("HEAD DESC: {!r}".format(desc)) 

2335 

2336 # Abort on certain descriptors (assume remaining values are 

2337 # examples or uninteresting, cf. gaan/Navajo, horior/Latin) 

2338 if re.match(r"^(per |e\.g\.$)", desc): 2338 ↛ 2339line 2338 didn't jump to line 2339 because the condition on line 2338 was never true

2339 break 

2340 

2341 # If it all consists of CJK characters, add it with the 

2342 # CJK tag. This is used at least for some Vietnamese 

2343 # words (e.g., ba/Vietnamese) 

2344 try: 

2345 if all(unicodedata.name(x).startswith("CJK ") for x in desc):

2346 add_related( 

2347 wxr, 

2348 data, 

2349 ["CJK"], 

2350 [desc], 

2351 text, 

2352 True, 

2353 is_reconstruction, 

2354 head_group, 

2355 ruby, 

2356 links, 

2357 link_dict, 

2358 ) 

2359 continue 

2360 except ValueError: 

2361 pass 

2362 

2363 # Handle some special cases 

2364 splitdesc = desc.split() 

2365 if (

2366 len(splitdesc) >= 3 

2367 and splitdesc[1] == "superlative" 

2368 and classify_desc(splitdesc[0]) != "tags" 

2369 and prev_tags 

2370 ): 

2371 # Handle the special case of second comparative after comma, 

2372 # followed by superlative without comma. E.g. 

2373 # mal/Portuguese/Adv 

2374 for ts in prev_tags: 

2375 add_related( 

2376 wxr, 

2377 data, 

2378 ts, 

2379 [splitdesc[0]], 

2380 text, 

2381 True, 

2382 is_reconstruction, 

2383 head_group, 

2384 ruby, 

2385 links, 

2386 link_dict, 

2387 ) 

2388 desc = " ".join(splitdesc[1:]) 

2389 elif (

2390 len(splitdesc) == 2 

2391 and splitdesc[0] in ("also", "and") 

2392 and prev_tags 

2393 and classify_desc(splitdesc[1]) != "tags" 

2394 ): 

2395 # Sometimes alternative forms are prefixed with "also" or 

2396 # "and" 

2397 for ts in prev_tags: 

2398 add_related( 

2399 wxr, 

2400 data, 

2401 ts, 

2402 [splitdesc[1]], 

2403 text, 

2404 True, 

2405 is_reconstruction, 

2406 head_group, 

2407 ruby, 

2408 links, 

2409 link_dict, 

2410 ) 

2411 continue 

2412 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",):

2413 continue 

2414 

2415 # If only one word, assume it is comma-separated alternative 

2416 # to the previous one 

2417 if " " not in desc: 

2418 cls = classify_desc(desc) 

2419 if cls != "tags": 

2420 if prev_tags:

2421 # Assume comma-separated alternative to previous one 

2422 for ts in prev_tags: 

2423 add_related( 

2424 wxr, 

2425 data, 

2426 ts, 

2427 [desc], 

2428 text, 

2429 True, 

2430 is_reconstruction, 

2431 head_group, 

2432 ruby, 

2433 links, 

2434 link_dict, 

2435 ) 

2436 continue 

2437 elif distw(titleparts, desc) <= 0.5:

2438 # Similar to head word, assume a dialectal variation to 

2439 # the base form. Cf. go/Alemannic German/Verb 

2440 add_related( 

2441 wxr, 

2442 data, 

2443 ["alternative"], 

2444 [desc], 

2445 text, 

2446 True, 

2447 is_reconstruction, 

2448 head_group, 

2449 ruby, 

2450 links, 

2451 link_dict, 

2452 ) 

2453 continue 

2454 elif ( 

2455 cls in ("romanization", "english") 

2456 and not have_romanization 

2457 and classify_desc(titleword) == "other" 

2458 and not ( 

2459 "categories" in data and desc in data["categories"] 

2460 ) 

2461 ): 

2462 # Assume it to be a romanization 

2463 add_romanization( 

2464 wxr, 

2465 data, 

2466 desc, 

2467 text, 

2468 is_reconstruction, 

2469 head_group, 

2470 ruby, 

2471 ) 

2472 have_romanization = True 

2473 continue 

2474 

2475 m = re.match(r"^(\d+) strokes?$", desc) 

2476 if m: 

2477 # Special case, used to give #strokes for Han characters 

2478 add_related( 

2479 wxr, 

2480 data, 

2481 ["strokes"], 

2482 [m.group(1)], 

2483 text, 

2484 True, 

2485 is_reconstruction, 

2486 head_group, 

2487 ruby, 

2488 links, 

2489 link_dict, 

2490 ) 

2491 continue 

2492 

2493 # See if it is radical+strokes 

2494 m = re.match( 

2495 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF" 

2496 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)" 

2497 r"( in (Japanese|Chinese|traditional Chinese|" 

2498 r"simplified Chinese))?$", 

2499 desc, 

2500 ) 

2501 if m:

2502 # Special case, used to give radical + strokes for Han 

2503 # characters 

2504 radical_strokes = m.group(1) 

2505 lang = m.group(3) 

2506 t = ["radical+strokes"] 

2507 if lang: 

2508 t.extend(lang.split()) 

2509 add_related( 

2510 wxr, 

2511 data, 

2512 t, 

2513 [radical_strokes], 

2514 text, 

2515 True, 

2516 is_reconstruction, 

2517 head_group, 

2518 ruby, 

2519 links, 

2520 link_dict, 

2521 ) 

2522 prev_tags = None 

2523 following_tags = None 

2524 continue 
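# e.g. desc "艸+9 in traditional Chinese" would be captured here with
# radical_strokes == "艸+9" and tags ["radical+strokes", "traditional",
# "Chinese"] (the language phrase is split into separate tags).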

2525 

2526 # See if it indicates historical Katakana orthography (←) or 

2527 # otherwise a katakana/hiragana form 

2528 m = re.match(r"←\s*|kana\s+", desc) 

2529 if m:

2530 if desc.startswith("←"): 

2531 t1 = "historical " 

2532 else: 

2533 t1 = "" 

2534 x = desc[m.end() :] 

2535 if x.endswith("?"): 

2536 x = x[:-1] 

2537 # XXX should we add a tag indicating uncertainty? 

2538 if x: 

2539 name = unicodedata.name(x[0]) 

2540 if name.startswith("HIRAGANA "): 

2541 desc = t1 + "hiragana " + x 

2542 elif name.startswith("KATAKANA "): 

2543 desc = t1 + "katakana " + x 

2544 

2545 # See if it is "n strokes in Chinese" or similar 

2546 m = re.match( 

2547 r"(\d+) strokes in (Chinese|Japanese|" 

2548 r"traditional Chinese|simplified Chinese)$", 

2549 desc, 

2550 ) 

2551 if m:

2552 # Special case, used to give just strokes for some Han chars 

2553 strokes = m.group(1) 

2554 lang = m.group(2) 

2555 t = ["strokes"] 

2556 t.extend(lang.split()) 

2557 add_related( 

2558 wxr, 

2559 data, 

2560 t, 

2561 [strokes], 

2562 text, 

2563 True, 

2564 is_reconstruction, 

2565 head_group, 

2566 ruby, 

2567 links, 

2568 link_dict, 

2569 ) 

2570 prev_tags = None 

2571 following_tags = None 

2572 continue 

2573 

2574 # American Sign Language has images (or requests for image) 

2575 # as heads, + this ASL gloss after. 

2576 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text) 

2577 if m2:

2578 add_related( 

2579 wxr, 

2580 data, 

2581 ["ASL-gloss"], 

2582 [m2.group(1)], 

2583 text, 

2584 True, 

2585 is_reconstruction, 

2586 head_group, 

2587 ruby, 

2588 links, 

2589 link_dict, 

2590 ) 

2591 continue 

2592 

2593 parts = list(m.group(0) for m in re.finditer(word_re, desc)) 

2594 if not parts:

2595 prev_tags = None 

2596 following_tags = None 

2597 continue 

2598 

2599 # Check for certain language-specific header part starts that 

2600 # modify the tags applied to the following word form 

2601 if len(parts) == 2 and language in lang_specific_head_map:

2602 ht = lang_specific_head_map[language] 

2603 if parts[0] in ht: 

2604 rem_tags, add_tags = ht[parts[0]] 

2605 new_prev_tags1: list[list[str]] = [] 

2606 tags2: Union[tuple[str, ...], list[str]] 

2607 for tags2 in prev_tags or [()]: 

2608 if rem_tags is True: # Remove all old tags 

2609 tsets = set() 

2610 else: 

2611 tsets = set(tags2) - set(rem_tags.split()) 

2612 tsets = tsets | set(add_tags.split()) 

2613 tags = list(sorted(tsets)) 

2614 add_related( 

2615 wxr, 

2616 data, 

2617 tags, 

2618 [parts[1]], 

2619 text, 

2620 True, 

2621 is_reconstruction, 

2622 head_group, 

2623 ruby, 

2624 links, 

2625 link_dict, 

2626 ) 

2627 new_prev_tags1.append(tags) 

2628 prev_tags = new_prev_tags1 

2629 following_tags = None 

2630 continue 

2631 

2632 # Handle the special case of descriptors that are parenthesized, 

2633 # e.g., (archaic or Scotland) 

2634 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc) 

2635 if m is not None and classify_desc(m.group(1)) == "tags":

2636 tagpart = m.group(1) 

2637 related = [m.group(2)] 

2638 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True) 

2639 if topics: 

2640 wxr.wtp.debug( 

2641 "parenthized head part {!r} contains topics: {}".format( 

2642 tagpart, topics 

2643 ), 

2644 sortid="form_descriptions/1647", 

2645 ) 

2646 elif m is not None and re.match(r"in the sense ", m.group(1)):

2647 # Handle certain ignored cases 

2648 # e.g. bord/Danish: in the sense "plank" 

2649 related = [m.group(2)] 

2650 tagsets = [()] 

2651 else: 

2652 # Normal parsing of the descriptor 

2653 alt_related = None 

2654 alt_tagsets = None 

2655 tagsets = None 

2656 for i in range(len(parts), 0, -1): 

2657 related = parts[i:] 

2658 tagparts = parts[:i] 

2659 # print(" i={} related={} tagparts={}" 

2660 # .format(i, related, tagparts)) 

2661 tagsets, topics = decode_tags( 

2662 " ".join(tagparts), no_unknown_starts=True 

2663 ) 

2664 # print("tagparts={!r} tagsets={} topics={} related={} " 

2665 # "alt_related={} distw={:.2f}" 

2666 # .format(tagparts, tagsets, topics, related, 

2667 # alt_related, 

2668 # distw(titleparts, parts[i - 1]))) 

2669 if ( 

2670 topics 

2671 or not tagsets 

2672 or any("error-unknown-tag" in x for x in tagsets) 

2673 ): 

2674 if alt_related is not None:

2675 # We already had a good division, so let's stop. 

2676 break 

2677 # Bad division, try deeper 

2678 continue 

2679 # print(f"{parts[i-1]=}, {parts=}") 

2680 if ( 

2681 i > 1 

2682 and len(parts[i - 1]) >= 4 

2683 and ( 

2684 distw(titleparts, parts[i - 1]) <= 0.4 

2685 or ( 

2686 wxr.wtp.section == "English" 

2687 and wxr.wtp.title 

2688 in WORDS_WITH_FALSE_POSITIVE_TAGS 

2689 and parts[i - 1] 

2690 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title] 

2691 ) 

2692 ) 

2693 # Fixes 'unaccountability' wiktext #1196 

2694 and not ( 

2695 wxr.wtp.section == "English" 

2696 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS 

2697 and parts[i - 1] 

2698 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title] 

2699 ) 

2700 # Fixes wiktextract #983, where "participle" 

2701 # was too close to "Martinize" and so this accepted 

2702 # ["participle", "Martinize"] as matching; this 

2703 # kludge prevents this from happening if titleparts 

2704 # is shorter than what would be 'related'. 

2705 # This breaks if we want to detect stuff that 

2706 # actually gets an extra space-separated word when 

2707 # 'inflected'. 

2708 and ( 

2709 len(titleparts) >= len(parts[i - 1 :]) 

2710 or "or" in parts[i - 1 :] 

2711 ) 

2712 ): 

2713 # print(f"Reached; {parts=}, {parts[i-1]=}") 

2714 alt_related = related 

2715 alt_tagsets = tagsets 

2716 continue 

2717 alt_related = None 

2718 alt_tagsets = None 

2719 break 

2720 else: 

2721 if alt_related is None:

2722 # Check if the parenthesized part is likely a 

2723 # romanization 

2724 if (

2725 (have_ruby or classify_desc(base) == "other") 

2726 and classify_desc(paren) == "romanization" 

2727 and not ( 

2728 "categories" in data 

2729 and desc in data["categories"] 

2730 ) 

2731 ): 

2732 for r in split_at_comma_semi( 

2733 paren, 

2734 extra=[" or "], 

2735 skipped=link_words_not_alnum, 

2736 ): 

2737 add_romanization( 

2738 wxr, 

2739 data, 

2740 r, 

2741 text, 

2742 is_reconstruction, 

2743 head_group, 

2744 ruby, 

2745 ) 

2746 have_romanization = True 

2747 continue 

2748 tagsets = [("error-unrecognized-head-form",)] 

2749 wxr.wtp.debug( 

2750 "unrecognized head form: {}".format(desc), 

2751 sortid="form_descriptions/1698", 

2752 ) 

2753 continue 

2754 

2755 if alt_related is not None:

2756 related = alt_related 

2757 tagsets = alt_tagsets 

2758 

2759 # print("FORM END: tagsets={} related={}".format(tagsets, related)) 

2760 # print("==================") 

2761 

2762 if (

2763 len(related) <= 0 

2764 and wxr.wtp.section == "English" 

2765 and tagsets is not None 

2766 and len(tagsets) > 0 

2767 and not any( 

2768 s.startswith("error-") for tagset in tagsets for s in tagset 

2769 ) 

2770 and any( 

2771 s in FORM_ASSOCIATED_TAG_WORDS 

2772 for tagset in tagsets 

2773 for s in tagset 

2774 ) 

2775 and ( 

2776 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS 

2777 or not any( # "or" short-circuits so the subscript below only runs when the title is a key 

2778 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""] 

2779 for rel in related 

2780 ) 

2781 ) 

2782 ): 

2783 wxr.wtp.debug( 

2784 f"Form tags without form: {desc=}, {tagsets=}", 

2785 sortid="form_description/20250107", 

2786 ) 

2787 if not tagsets:

2788 continue 

2789 

2790 # print(f"{alts=}, {related=}") 

2791 

2792 assert isinstance(related, (list, tuple)) 

2793 related_str = " ".join(related) 

2794 if "or" in titleparts: 

2795 alts = [related_str] 

2796 else: 

2797 alts = split_at_comma_semi( 

2798 related_str, 

2799 separators=[r"\bor\b"], 

2800 skipped=link_words_not_alnum, 

2801 ) 

2802 # print(f"{related_str=}, {alts=}") 

2803 if not alts: 

2804 alts = [""] 

2805 for related_str in alts: 

2806 if related_str: 

2807 if prev_tags and ( 

2808 all( 

2809 all( 

2810 t in ["nonstandard", "dialectal"] 

2811 or valid_tags[t] == "dialect" 

2812 for t in ts 

2813 ) 

2814 for ts in tagsets 

2815 ) 

2816 or ( 

2817 any("participle" in ts for ts in prev_tags) 

2818 and all( 

2819 "attributive" in ts 

2820 or any(valid_tags[t] == "gender" for t in ts) 

2821 for ts in tagsets 

2822 ) 

2823 ) 

2824 ): 

2825 # Merged with previous tags. Don't update previous 

2826 # tags here; cf. burn/English/Verb 

2827 for tags_l in tagsets: 

2828 for ts in prev_tags: 

2829 tags_l1 = sorted(set(tags_l) | set(ts)) 

2830 add_related( 

2831 wxr, 

2832 data, 

2833 tags_l1, 

2834 [related_str], 

2835 text, 

2836 True, 

2837 is_reconstruction, 

2838 head_group, 

2839 ruby, 

2840 links, 

2841 link_dict, 

2842 ) 

2843 else: 

2844 # Not merged with previous tags 

2845 for tags_l in tagsets: 

2846 if following_tags is not None:

2847 for ts in following_tags: 

2848 tags_l1 = list( 

2849 sorted(set(tags_l) | set(ts)) 

2850 ) 

2851 add_related( 

2852 wxr, 

2853 data, 

2854 tags_l1, 

2855 [related_str], 

2856 text, 

2857 True, 

2858 is_reconstruction, 

2859 head_group, 

2860 ruby, 

2861 links, 

2862 link_dict, 

2863 ) 

2864 else: 

2865 ret = add_related( 

2866 wxr, 

2867 data, 

2868 tags_l, 

2869 [related_str], 

2870 text, 

2871 True, 

2872 is_reconstruction, 

2873 head_group, 

2874 ruby, 

2875 links, 

2876 link_dict, 

2877 ) 

2878 if ret is not None:

2879 following_tags = ret 

2880 prev_tags = tagsets 

2881 else: 

2882 if desc_i < len(new_desc) - 1 and all(

2883 "participle" in ts or "infinitive" in ts 

2884 for ts in tagsets 

2885 ): 

2886 # Interpret it as a standalone form description 

2887 # in the middle, probably followed by forms or 

2888 # language-specific descriptors. cf. drikke/Danish 

2889 new_prev_tags2 = [] 

2890 for ts1 in prev_tags or [()]: 

2891 for ts2 in tagsets: 

2892 ts = tuple(sorted(set(ts1) | set(ts2))) 

2893 new_prev_tags2.append(ts) 

2894 prev_tags = new_prev_tags2 

2895 continue 

2896 for tags in tagsets: 

2897 data_extend(data, "tags", tags) 

2898 prev_tags = tagsets 

2899 following_tags = None 

2900 

2901 # Finally, if we collected hiragana/katakana, add them now 

2902 if hiragana:

2903 add_related( 

2904 wxr, 

2905 data, 

2906 ["hiragana"], 

2907 [hiragana], 

2908 text, 

2909 True, 

2910 is_reconstruction, 

2911 head_group, 

2912 ruby, 

2913 ) 

2914 if katakana:

2915 add_related( 

2916 wxr, 

2917 data, 

2918 ["katakana"], 

2919 [katakana], 

2920 text, 

2921 True, 

2922 is_reconstruction, 

2923 head_group, 

2924 ruby, 

2925 ) 

2926 

2927 # XXX check if this is actually relevant, tags in word root data 

2928 # is extremely rare (not sure where they slip through). 

2929 tags = data.get("tags", []) # type:ignore 

2930 if len(tags) > 0: 

2931 # wxr.wtp.debug( 

2932 # f"Tags appear in word root data: {data['tags']=}", # type:ignore 

2933 # sortid="form_descriptions/2620/20240606", 

2934 # ) # Messes up tests. 

2935 data["tags"] = sorted(set(tags)) # type:ignore 

2936 

2937 

2938def parse_sense_qualifier( 

2939 wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData] 

2940) -> None: 

2941 """Parses tags or topics for a sense or some other data. The values are 

2942 added into the dictionary ``data``.""" 

2943 assert isinstance(wxr, WiktextractContext) 

2944 assert isinstance(text, str) 

2945 assert isinstance(data, dict) 

2946 # print("parse_sense_qualifier:", text) 

2947 if re.match(r"\([^()]+\)$", text): 2947 ↛ 2948line 2947 didn't jump to line 2948 because the condition on line 2947 was never true

2948 text = text[1:-1] 

2949 if re.match(r'"[^"]+"$', text): 2949 ↛ 2950line 2949 didn't jump to line 2950 because the condition on line 2949 was never true

2950 text = text[1:-1] 

2951 lst = map_with(xlat_descs_map, [text]) 

2952 sense_tags: list[str] = [] 

2953 for text in lst: 

2954 for semi in split_at_comma_semi(text): 

2955 if not semi:

2956 continue 

2957 orig_semi = semi 

2958 idx = semi.find(":") 

2959 if idx >= 0:

2960 semi = semi[:idx] 

2961 cls = classify_desc(semi, allow_unknown_tags=True) 

2962 # print("parse_sense_qualifier: classify_desc: {} -> {}" 

2963 # .format(semi, cls)) 

2964 if cls == "tags": 

2965 tagsets, topics = decode_tags(semi) 

2966 data_extend(data, "topics", topics) 

2967 # XXX should think how to handle distinct options better, 

2968 # e.g., "singular and plural genitive"; that can't really be 

2969 # done without changing the calling convention of this function. 

2970 # Should split sense if more than one category of tags differs. 

2971 for tags in tagsets: 

2972 sense_tags.extend(tags) 

2973 elif cls == "taxonomic": 2973 ↛ 2974line 2973 didn't jump to line 2974 because the condition on line 2973 was never true

2974 if re.match(r"×[A-Z]", semi): 

2975 sense_tags.append("extinct") 

2976 semi = semi[1:] 

2977 data["taxonomic"] = semi 

2978 elif cls == "english": 

2979 if "qualifier" in data and data["qualifier"] != orig_semi: 2979 ↛ 2980line 2979 didn't jump to line 2980 because the condition on line 2979 was never true

2980 data["qualifier"] += "; " + orig_semi 

2981 else: 

2982 data["qualifier"] = orig_semi 

2983 else: 

2984 wxr.wtp.debug( 

2985 "unrecognized sense qualifier: {}".format(text), 

2986 sortid="form_descriptions/1831", 

2987 ) 

2988 sense_tags = sorted(set(sense_tags)) 

2989 data_extend(data, "tags", sense_tags) 

2990 

2991 

2992def parse_pronunciation_tags( 

2993 wxr: WiktextractContext, text: str, data: SoundData 

2994) -> None: 

2995 assert isinstance(wxr, WiktextractContext) 

2996 assert isinstance(text, str) 

2997 assert isinstance(data, dict) 

2998 text = text.strip() 

2999 if not text: 

3000 return 

3001 cls = classify_desc(text) 

3002 notes = [] 

3003 if cls == "tags": 

3004 tagsets, topics = decode_tags(text) 

3005 data_extend(data, "topics", topics) 

3006 for tagset in tagsets: 

3007 for t in tagset: 

3008 if " " in t: 3008 ↛ 3009line 3008 didn't jump to line 3009 because the condition on line 3008 was never true

3009 notes.append(t) 

3010 else: 

3011 data_append(data, "tags", t) 

3012 else: 

3013 notes.append(text) 

3014 if notes: 

3015 data["note"] = "; ".join(notes) 

3016 

3017 

3018def parse_translation_desc( 

3019 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData 

3020) -> None: 

3021 assert isinstance(wxr, WiktextractContext) 

3022 assert isinstance(lang, str) # The language of ``text`` 

3023 assert isinstance(text, str) 

3024 assert isinstance(tr, dict) 

3025 # print("parse_translation_desc:", text) 

3026 

3027 # Process all parenthesized parts from the translation item 

3028 note = None 

3029 restore_beginning = "" 

3030 restore_end = "" 

3031 while True: 

3032 beginning = False 

3033 # See if we can find a parenthesized expression at the end 

3034 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text) 

3035 if m: 

3036 par = m.group(1) 

3037 text = text[: m.start()] 

3038 if par.startswith(("literally ", "lit.")): 

3039 continue # Not useful for disambiguation in many idioms 

3040 else: 

3041 # See if we can find a parenthesized expression at the start 

3042 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text) 

3043 if m: 

3044 par = m.group(1) 

3045 text = text[m.end() :] 

3046 beginning = True 

3047 if re.match(r"^(\d|\s|,| or | and )+$", par): 3047 ↛ 3052line 3047 didn't jump to line 3052 because the condition on line 3047 was never true

3048 # Looks like this beginning parenthesized expression only 

3049 # contains digits or their combinations. We assume such 

3050 # to be sense descriptions if no sense has been selected, 

3051 # or otherwise just ignore them. 

3052 if not tr.get("sense"): 

3053 tr["sense"] = par 

3054 continue 

3055 else: 

3056 # See if we can find a parenthesized expression in the middle. 

3057 # Romanizations are sometimes between word and gender marker, 

3058 # e.g. wife/English/Tr/Yiddish. 

3059 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text) 

3060 if m: 

3061 par = m.group(1) 

3062 text = text[: m.start()] + text[m.end() :] 

3063 else: 

3064 # No more parenthesized expressions - break out of the loop 

3065 break 

3066 

3067 # Some cleanup of artifacts that may result from skipping some templates 

3068 # in earlier stages 

3069 if par.startswith(": "): 3069 ↛ 3070line 3069 didn't jump to line 3070 because the condition on line 3069 was never true

3070 par = par[2:] 

3071 if par.endswith(","): 3071 ↛ 3072line 3071 didn't jump to line 3072 because the condition on line 3071 was never true

3072 par = par[:-1] 

3073 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 3073 ↛ 3074line 3073 didn't jump to line 3074 because the condition on line 3073 was never true

3074 par = par[1:-1] 

3075 par = par.strip() 

3076 

3077 # Check for special script pronunciation followed by romanization, 

3078 # used in many Asian languages. 

3079 lst = par.split(", ") 

3080 if len(lst) == 2: 

3081 a, r = lst 

3082 if classify_desc(a) == "other": 

3083 cls = classify_desc(r) 

3084 # print("parse_translation_desc: r={} cls={}".format(r, cls)) 

3085 if cls == "romanization" or ( 

3086 cls == "english" and len(r.split()) == 1 and r[0].islower() 

3087 ): 

3088 if tr.get("alt") and tr.get("alt") != a: 3088 ↛ 3089line 3088 didn't jump to line 3089 because the condition on line 3088 was never true

3089 wxr.wtp.debug( 

3090 'more than one value in "alt": {} vs. {}'.format( 

3091 tr["alt"], a 

3092 ), 

3093 sortid="form_descriptions/1930", 

3094 ) 

3095 tr["alt"] = a 

3096 if tr.get("roman") and tr.get("roman") != r: 3096 ↛ 3097line 3096 didn't jump to line 3097 because the condition on line 3096 was never true

3097 wxr.wtp.debug( 

3098 'more than one value in "roman": {} vs. {}'.format( 

3099 tr["roman"], r 

3100 ), 

3101 sortid="form_descriptions/1936", 

3102 ) 

3103 tr["roman"] = r 

3104 continue 
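# e.g. par == "めし, meshi" (hypothetical): "めし" classifies as "other"
# and "meshi" as a romanization, so tr["alt"] = "めし" and
# tr["roman"] = "meshi" before moving to the next parenthesized part.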

3105 

3106 # Check for certain comma-separated tags combined with English text 

3107 # at the beginning or end of a comma-separated parenthesized list 

3108 while len(lst) > 1: 

3109 cls = classify_desc(lst[0]) 

3110 if cls == "tags": 3110 ↛ 3111line 3110 didn't jump to line 3111 because the condition on line 3110 was never true

3111 tagsets, topics = decode_tags(lst[0]) 

3112 for t in tagsets: 

3113 data_extend(tr, "tags", t) 

3114 data_extend(tr, "topics", topics) 

3115 lst = lst[1:] 

3116 continue 

3117 cls = classify_desc(lst[-1]) 

3118 if cls == "tags": 

3119 tagsets, topics = decode_tags(lst[-1]) 

3120 for t in tagsets: 

3121 data_extend(tr, "tags", t) 

3122 data_extend(tr, "topics", topics) 

3123 lst = lst[:-1] 

3124 continue 

3125 break 

3126 par = ", ".join(lst) 

3127 

3128 if not par:

3129 continue 

3130 if re.search(tr_ignored_parens_re, par):

3131 continue 

3132 if par.startswith("numeral:"): 

3133 par = par[8:].strip() 

3134 

3135 # Classify the part in parenthesis and process accordingly 

3136 cls = classify_desc(par) 

3137 # print("parse_translation_desc classify: {!r} -> {}" 

3138 # .format(par, cls)) 

3139 if par == text: 

3140 pass 

3141 if par == "f": 3141 ↛ 3142line 3141 didn't jump to line 3142 because the condition on line 3141 was never true

3142 data_append(tr, "tags", "feminine") 

3143 elif par == "m": 3143 ↛ 3144line 3143 didn't jump to line 3144 because the condition on line 3143 was never true

3144 data_append(tr, "tags", "masculine") 

3145 elif cls == "tags": 

3146 tagsets, topics = decode_tags(par) 

3147 for tags in tagsets: 

3148 data_extend(tr, "tags", tags) 

3149 data_extend(tr, "topics", topics) 

3150 elif cls == "english": 

3151 # If the text contains any of certain grammatical words, treat it 

3152 # as a "note" instead of "english" 

3153 if re.search(tr_note_re, par): 

3154 if par.endswith(":"): 3154 ↛ 3155line 3154 didn't jump to line 3155 because the condition on line 3154 was never true

3155 par = par[:-1] 

3156 if par not in ("see entry for forms",):

3157 if note:

3158 note = note + ";" + par 

3159 else: 

3160 note = par 

3161 else: 

3162 # There can be more than one parenthesized english item, see 

3163 # e.g. Aunt/English/Translations/Tamil 

3164 if "translation" in tr and "english" in tr: 

3165 tr["english"] += "; " + par # DEPRECATED for "translation" 

3166 tr["translation"] += "; " + par 

3167 else: 

3168 tr["english"] = par # DEPRECATED for "translation" 

3169 tr["translation"] = par 

3170 elif cls == "romanization": 

3171 # print("roman text={!r} text cls={}" 

3172 # .format(text, classify_desc(text))) 

3173 if classify_desc(text) in ( 

3174 "english", 

3175 "romanization", 

3176 ) and lang not in ("Egyptian",): 

3177 if beginning: 

3178 restore_beginning += "({}) ".format(par) 

3179 else: 

3180 restore_end = " ({})".format(par) + restore_end 

3181 else: 

3182 if tr.get("roman"): 3182 ↛ 3183line 3182 didn't jump to line 3183 because the condition on line 3182 was never true

3183 wxr.wtp.debug( 

3184 'more than one value in "roman": {} vs. {}'.format( 

3185 tr["roman"], par 

3186 ), 

3187 sortid="form_descriptions/2013", 

3188 ) 

3189 tr["roman"] = par 

3190 elif cls == "taxonomic": 3190 ↛ 3191line 3190 didn't jump to line 3191 because the condition on line 3190 was never true

3191 if tr.get("taxonomic"): 

3192 wxr.wtp.debug( 

3193 'more than one value in "taxonomic": {} vs. {}'.format( 

3194 tr["taxonomic"], par 

3195 ), 

3196 sortid="form_descriptions/2019", 

3197 ) 

3198 if re.match(r"×[A-Z]", par): 

3199 data_append(tr, "tags", "extinct") 

3200 par = par[1:] 

3201 tr["taxonomic"] = par 

3202 elif cls == "other": 3202 ↛ 3212line 3202 didn't jump to line 3212 because the condition on line 3202 was always true

3203 if tr.get("alt"): 3203 ↛ 3204line 3203 didn't jump to line 3204 because the condition on line 3203 was never true

3204 wxr.wtp.debug( 

3205 'more than one value in "alt": {} vs. {}'.format( 

3206 tr["alt"], par 

3207 ), 

3208 sortid="form_descriptions/2028", 

3209 ) 

3210 tr["alt"] = par 

3211 else: 

3212 wxr.wtp.debug( 

3213 "parse_translation_desc unimplemented cls {}: {}".format( 

3214 cls, par 

3215 ), 

3216 sortid="form_descriptions/2033", 

3217 ) 

3218 

3219 # Check for gender indications in suffix 

3220 text, final_tags = parse_head_final_tags(wxr, lang, text) 

3221 data_extend(tr, "tags", final_tags) 

3222 

3223 # Restore those parts that we did not want to remove (they are often 

3224 # optional words or words that are always used with the given translation) 

3225 text = restore_beginning + text + restore_end 

3226 

3227 if note: 

3228 tr["note"] = note.strip() 

3229 if text and text not in ignored_translations: 

3230 tr["word"] = text.strip() 

3231 

3232 # Sometimes gender seems to be at the end of "roman" field, see e.g. 

3233 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction") 

3234 roman = tr.get("roman") 

3235 if roman: 

3236 if roman.endswith(" f"): 3236 ↛ 3237line 3236 didn't jump to line 3237 because the condition on line 3236 was never true

3237 data_append(tr, "tags", "feminine") 

3238 tr["roman"] = roman[:-2].strip() 

3239 elif roman.endswith(" m"): 3239 ↛ 3240line 3239 didn't jump to line 3240 because the condition on line 3239 was never true

3240 data_append(tr, "tags", "masculine") 

3241 tr["roman"] = roman[:-2].strip() 

3242 

3243 # If the word now has "translation" field but no "roman" field, and 

3244 # the word would be classified "other" (generally non-latin 

3245 # characters), and the value in "translation" is only one lowercase 

3246 # word, move it to "roman". This happens semi-frequently when the 

3247 # translation is transliterated the same as some English word. 

3248 roman = tr.get("roman") 

3249 english = tr.get("translation") 

3250 if english and not roman and "word" in tr: 

3251 cls = classify_desc(tr["word"]) 

3252 if cls == "other" and " " not in english and english[0].islower(): 

3253 del tr["translation"] 

3254 if "english" in tr: # DEPRECATED for "translation" 3254 ↛ 3256line 3254 didn't jump to line 3256 because the condition on line 3254 was always true

3255 del tr["english"] 

3256 tr["roman"] = english 

3257 

3258 # If the entry now has both tr["roman"] and tr["word"] and they have 

3259 # the same value, delete tr["roman"] (e.g., man/English/Translations 

3260 # Evenki) 

3261 if tr.get("word") and tr.get("roman") == tr.get("word"): 3261 ↛ 3262line 3261 didn't jump to line 3262 because the condition on line 3261 was never true

3262 del tr["roman"] 

3263 

3264 

3265def parse_alt_or_inflection_of( 

3266 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str] 

3267) -> Optional[tuple[list[str], Optional[list[AltOf]]]]: 

3268 """Tries to parse an inflection-of or alt-of description. If successful, 

3269 this returns (tags, alt-of/inflection-of-dict). If the description cannot 

3270 be parsed, this returns None. This may also return (tags, None) when the 

3271 gloss describes a form (or some other tags were extracted from it), but 

3272 there was no alt-of/form-of/synonym-of word.""" 

3273 # print("parse_alt_or_inflection_of: {!r}".format(gloss)) 

3274 # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning. 

3275 

3276 # Never interpret a gloss that is equal to the word itself as a tag 

3277 # (e.g., instrumental/Romanian, instrumental/Spanish). 

3278 if gloss.lower() == wxr.wtp.title.lower() or ( # type:ignore[union-attr] 

3279 len(gloss) >= 5 and distw([gloss.lower()], wxr.wtp.title.lower()) < 0.2 # type:ignore[union-attr] 

3280 ): 

3281 return None 

3282 

3283 # First try parsing it as-is 

3284 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args) 

3285 if parsed is not None: 

3286 return parsed 

3287 

3288 # Next try parsing it with the first character converted to lowercase if 

3289 # it was previously uppercase. 

3290 if gloss and gloss[0].isupper(): 

3291 gloss = gloss[0].lower() + gloss[1:] 

3292 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args) 

3293 if parsed is not None: 

3294 return parsed 

3295 

3296 return None 
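# Usage sketch (hypothetical): a gloss like "plural of dog" is expected
# to come back roughly as (["form-of", "plural"], [{"word": "dog"}]),
# while a gloss that essentially repeats the page title returns None.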

3297 

3298 

3299# These tags are not allowed in alt-or-inflection-of parsing 

3300alt_infl_disallowed: set[str] = set( 

3301 [ 

3302 "error-unknown-tag", 

3303 "place", # Not in inflected forms and causes problems e.g. house/ 

3304 # English 

3305 ] 

3306) 

3307 

3308 

3309def parse_alt_or_inflection_of1( 

3310 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str] 

3311) -> Optional[tuple[list[str], Optional[list[AltOf]]]]: 

3312 """Helper function for parse_alt_or_inflection_of. This handles a single 

3313 capitalization.""" 

3314 if not gloss or not gloss.strip(): 3314 ↛ 3315 

3315 return None 

3316 

3317 # Prevent some common errors where we would parse something we shouldn't 

3318 if re.search(r"(?i)form of address ", gloss): 3318 ↛ 3319 

3319 return None 

3320 

3321 gloss = re.sub(r"only used in [^,]+, ", "", gloss) 

3322 

3323 # First try all formats ending with "of" (or other known last words that 

3324 # can end a form description) 

3325 matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss)) 

3326 m: Optional[re.Match] 

3327 for m in reversed(matches): 

3328 desc = gloss[: m.end()].strip() 

3329 base = gloss[m.end() :].strip() 

3330 tagsets, topics = decode_tags(desc, no_unknown_starts=True) 

3331 if not topics and any( 

3332 not (alt_infl_disallowed & set(ts)) for ts in tagsets 

3333 ): 

3334 # Successfully parsed, including "of" etc. 

3335 tags: list[str] = [] 

3336 # If you have ("Western-Armenian", ..., "form-of") as your 

3337 # tag set, it's most probable that it's something like 

3338 # "Western Armenian form of խոսել (xosel)", which should 

3339 # get "alt-of" instead of "form-of" (inflection). 

3340 # խօսիլ/Armenian 

3341 for ts_t in tagsets: 

3342 if "form-of" in ts_t and any( 

3343 valid_tags.get(tk) == "dialect" for tk in ts_t 

3344 ): 

3345 ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"} 

3346 else: 

3347 ts_s = set(ts_t) 

3348 if not (alt_infl_disallowed & ts_s): 3348 ↛ 3341 

3349 tags.extend(ts_s) 

3350 if ( 

3351 "alt-of" in tags 

3352 or "form-of" in tags 

3353 or "synonym-of" in tags 

3354 or "compound-of" in tags 

3355 ): 

3356 break 

3357 if m.group(1) == "of": 

3358 # Try parsing without the final "of". This is commonly used in 

3359 # various form-of expressions. 

3360 desc = gloss[: m.start()] 

3361 base = gloss[m.end() :] 

3362 tagsets, topics = decode_tags(desc, no_unknown_starts=True) 

3363 # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}" 

3364 # .format(desc, base, tagsets, topics)) 

3365 if not topics and any( 

3366 not (alt_infl_disallowed & set(t)) for t in tagsets 

3367 ): 

3368 tags = [] 

3369 for t in tagsets: 

3370 if not (alt_infl_disallowed & set(t)): 3370 ↛ 3369 

3371 tags.extend(t) 

3372 # It must have at least one tag from form_of_tags 

3373 if set(tags) & form_of_tags: 

3374 # Accept this as form-of 

3375 tags.append("form-of") 

3376 break 

3377 if set(tags) & alt_of_tags: 

3378 # Accept this as alt-of 

3379 tags.append("alt-of") 

3380 break 

3381 

3382 else: 

3383 # Did not find a form description based on last word; see if the 

3384 # whole description is tags 

3385 tagsets, topics = decode_tags(gloss, no_unknown_starts=True) 

3386 if not topics and any( 

3387 not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts) 

3388 for ts in tagsets 

3389 ): 

3390 tags = [] 

3391 for ts in tagsets: 

3392 if not (alt_infl_disallowed & set(ts)) and form_of_tags & set( 3392 ↛ 3391 

3393 ts 

3394 ): 

3395 tags.extend(ts) 

3396 base = "" 

3397 else: 

3398 return None 

3399 

3400 # kludge for Spanish (again): 'x of [word] combined with [clitic]' 

3401 m = re.search(r"combined with \w+$", base) 

3402 if m: 3402 ↛ 3403 

3403 tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True) 

3404 if not topics: 

3405 for ts in tagsets: 

3406 tags.extend(ts) 

3407 base = base[: m.start()] 

3408 

3409 # It is fairly common for form_of glosses to end with something like 

3410 # "ablative case" or "in instructive case". Parse that ending. 

3411 base = base.strip() 

3412 lst = base.split() 

3413 # print("parse_alt_or_inflection_of: lst={}".format(lst)) 

3414 if len(lst) >= 3 and lst[-1] in ("case", "case."): 3414 ↛ 3415 

3415 node = valid_sequences.children.get(lst[-2]) 

3416 if node and node.end: 

3417 for s in node.tags: 

3418 tags.extend(s.split(" ")) 

3419 lst = lst[:-2] 

3420 if lst[-1] == "in" and len(lst) > 1: 

3421 lst = lst[:-1] 

3422 

3423 # Eliminate empty and duplicate tags 

3424 tags = sorted(set(t for t in tags if t)) 

3425 

3426 # Clean up some extra stuff from the linked word, separating the text 

3427 # into ``base`` (the linked word) and ``extra`` (additional information, 

3428 # such as English translation or clarifying word sense information). 

3429 orig_base = base 

3430 base = re.sub(alt_of_form_of_clean_re, "", orig_base) 

3431 base = re.sub(r" [(⟨][^()]*[)⟩]", "", base) # Remove all (...) groups 

3432 extra = orig_base[len(base) :] 

3433 extra = re.sub(r"^[- :;.,,—]+", "", extra) 

3434 if extra.endswith(".") and extra.count(".") == 1: 

3435 extra = extra[:-1].strip() 

3436 m = re.match(r"^\(([^()]*)\)$", extra) 

3437 if m: 3437 ↛ 3438 

3438 extra = m.group(1) 

3439 else: 

3440 # These weird brackets are used in "slash mark" 

3441 m = re.match(r"^⟨([^()]*)⟩$", extra) 

3442 if m: 3442 ↛ 3443 

3443 extra = m.group(1) 

3444 m = re.match(r'^[“"]([^"“”]*)["”]$', extra) 

3445 if m: 3445 ↛ 3446 

3446 extra = m.group(1) 

3447 # Note: base might still contain comma-separated values and values 

3448 # separated by "and" 

3449 base = base.strip() 

3450 if base.endswith(",") and len(base) > 2: 3450 ↛ 3451 

3451 base = base[:-1].strip() 

3452 while ( 

3453 base.endswith(".") 

3454 and not wxr.wtp.page_exists(base) 

3455 and base not in gloss_template_args 

3456 ): 

3457 base = base[:-1].strip() 

3458 if base.endswith('(\u201cconjecture")'): 3458 ↛ 3459line 3458 didn't jump to line 3459 because the condition on line 3458 was never true

3459 base = base[:-14].strip() 

3460 tags.append("conjecture") 

3461 while ( 3461 ↛ 3466 

3462 base.endswith(".") 

3463 and not wxr.wtp.page_exists(base) 

3464 and base not in gloss_template_args 

3465 ): 

3466 base = base[:-1].strip() 

3467 if ( 3467 ↛ 3472 

3468 base.endswith(".") 

3469 and base not in gloss_template_args 

3470 and base[:-1] in gloss_template_args 

3471 ): 

3472 base = base[:-1] 

3473 base = base.strip() 

3474 if not base: 

3475 return tags, None 

3476 

3477 # Kludge: Spanish verb forms seem to have a dot added at the end. 

3478 # Remove it; we know of no Spanish verbs ending with a dot. 

3479 language = wxr.wtp.section 

3480 pos = wxr.wtp.subsection 

3481 # print("language={} pos={} base={}".format(language, pos, base)) 

3482 if ( 3482 ↛ 3488 

3483 base.endswith(".") 

3484 and len(base) > 1 

3485 and base[-2].isalpha() 

3486 and (language == "Spanish" and pos == "Verb") 

3487 ): 

3488 base = base[:-1] 

3489 

3490 # Split base to alternatives when multiple alternatives provided 

3491 parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "]) 

3492 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "") 

3493 if ( 

3494 len(parts) <= 1 

3495 or base.startswith("/") 

3496 or base.endswith("/") 

3497 or "/" in titleword 

3498 ): 

3499 parts = [base] 

3500 # Split base to alternatives when of form "a or b" and "a" and "b" are 

3501 # similar (generally spelling variants of the same word or similar words) 

3502 if len(parts) == 1: 

3503 pp = base.split() 

3504 if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4: 

3505 parts = [pp[0], pp[2]] 

3506 
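# Illustrative example (hypothetical; assumes distw is a normalized 
# edit-distance): base == "colour or color" splits into 
# parts == ["colour", "color"], since the two sides fall within the 
# 0.4 similarity threshold above. 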

3507 # Create form-of/alt-of entries based on the extracted data 

3508 dt_lst: list[AltOf] = [] 

3509 for p in parts: 

3510 # Check for some suspicious base forms 

3511 m = re.search(r"[.,] |[{}()]", p) 

3512 if m and not wxr.wtp.page_exists(p): 3512 ↛ 3513 

3513 wxr.wtp.debug( 

3514 "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p), 

3515 sortid="form_descriptions/2278", 

3516 ) 

3517 if p.startswith("*") and len(p) >= 3 and p[1].isalpha(): 3517 ↛ 3518 

3518 p = p[1:] 

3519 dt: AltOf = {"word": p} 

3520 if extra: 

3521 dt["extra"] = extra 

3522 dt_lst.append(dt) 

3523 # print("alt_or_infl_of returning tags={} lst={} base={!r}" 

3524 # .format(tags, lst, base)) 

3525 return tags, dt_lst 

3526 
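# Illustrative sketch (hypothetical base text; assumes 
# alt_of_form_of_clean_re does not match it): for base == 'xosel ("to 
# speak")', the cleanup above moves the parenthesized translation into 
# ``extra``, so the result is (tags, [{"word": "xosel", 
# "extra": "to speak"}]). 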

3527 

3528@functools.lru_cache(maxsize=65536) 

3529def classify_desc( 

3530 desc: str, 

3531 allow_unknown_tags=False, 

3532 no_unknown_starts=False, 

3533 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(), 

3534) -> str: 

3535 """Determines whether the given description is most likely tags, english, 

3536 a romanization, or something else. Returns one of: "tags", "english", 

3537 "romanization", or "other". If ``allow_unknown_tags`` is True, then 

3538 allow "tags" classification even when the only tags are those starting 

3539 with a word in allowed_unknown_starts.""" 

3540 assert isinstance(desc, str) 

3541 # Empty and whitespace-only strings are treated as "other" 

3542 desc = desc.strip() 

3543 if not desc: 

3544 return "other" 

3545 

3546 normalized_desc = unicodedata.normalize("NFKD", desc) 

3547 

3548 # If it can be fully decoded as tags without errors, treat as tags 

3549 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts) 

3550 for tagset in tagsets: 

3551 assert isinstance(tagset, (list, tuple, set)) 

3552 if "error-unknown-tag" not in tagset and ( 

3553 topics or allow_unknown_tags or any(" " not in x for x in tagset) 

3554 ): 

3555 return "tags" 

3556 

3557 # Check if it looks like the taxonomic name of a species 

3558 if desc in known_species: 

3559 return "taxonomic" 

3560 desc1 = re.sub(r"^×([A-Z])", r"\1", desc) 

3561 desc1 = re.sub(r"\s*×.*", "", desc1) 

3562 lst = desc1.split() 

3563 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts: 

3564 have_non_english = 1 if lst[0].lower() not in english_words else 0 

3565 for x in lst[1:]: 

3566 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"): 

3567 continue 

3568 if x[0].isupper(): 

3569 break 

3570 if x not in english_words: 

3571 have_non_english += 1 

3572 else: 

3573 # Starts with known taxonomic term, does not contain uppercase 

3574 # words (except allowed letters) and at least one word is not 

3575 # English 

3576 if have_non_english >= len(lst) - 1 and have_non_english > 0: 3576 ↛ 3582 

3577 return "taxonomic" 

3578 
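# Illustrative example (hypothetical; assumes "Panthera" is in 
# known_firsts and the epithet is not an English word): a binomial such 
# as "Panthera atrox" that is missing from known_species would still be 
# classified "taxonomic" by the heuristic above. 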

3579 # If all words are in our English dictionary, interpret as English. 

3580 # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde" 

3581 # in ASCII. Took me a while to figure out. 

3582 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1: 

3583 if desc in english_words and desc[0].isalpha(): 

3584 return "english" # Handles ones containing whitespace 

3585 desc1 = re.sub( 

3586 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc 

3587 ) 

3588 tokens = tokenizer.tokenize(desc1) 

3589 if not tokens: 3589 ↛ 3590 

3590 return "other" 

3591 lst_bool = list( 

3592 x not in not_english_words 

3593 and 

3594 # not x.isdigit() and 

3595 ( 

3596 x in english_words 

3597 or x.lower() in english_words 

3598 or x in known_firsts 

3599 or x[0].isdigit() 

3600 or x in accepted 

3601 or 

3602 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or 

3603 ( 

3604 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words 

3605 ) # Plural 

3606 or ( 

3607 x.endswith("ies") 

3608 and len(x) >= 5 

3609 and x[:-3] + "y" in english_words 

3610 ) # E.g. lily - lilies 

3611 or ( 

3612 x.endswith("ing") 

3613 and len(x) >= 5 

3614 and x[:-3] in english_words 

3615 ) # E.g. bring - bringing 

3616 or ( 

3617 x.endswith("ing") 

3618 and len(x) >= 5 

3619 and x[:-3] + "e" in english_words 

3620 ) # E.g., tone - toning 

3621 or ( 

3622 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words 

3623 ) # E.g. hang - hanged 

3624 or ( 

3625 x.endswith("ed") 

3626 and len(x) >= 5 

3627 and x[:-2] + "e" in english_words 

3628 ) # E.g. atone - atoned 

3629 or (x.endswith("'s") and x[:-2] in english_words) 

3630 or (x.endswith("s'") and x[:-2] in english_words) 

3631 or ( 

3632 x.endswith("ise") 

3633 and len(x) >= 5 

3634 and x[:-3] + "ize" in english_words 

3635 ) 

3636 or ( 

3637 x.endswith("ised") 

3638 and len(x) >= 6 

3639 and x[:-4] + "ized" in english_words 

3640 ) 

3641 or ( 

3642 x.endswith("ising") 

3643 and len(x) >= 7 

3644 and x[:-5] + "izing" in english_words 

3645 ) 

3646 or ( 

3647 re.search(r"[-/]", x) 

3648 and all( 

3649 ((y in english_words and len(y) > 2) or not y) 

3650 for y in re.split(r"[-/]", x) 

3651 ) 

3652 ) 

3653 ) 

3654 for x in tokens 

3655 ) 

3656 cnt = lst_bool.count(True) 

3657 rejected_words = tuple( 

3658 x for i, x in enumerate(tokens) if not lst_bool[i] 

3659 ) 

3660 if ( 

3661 any( 

3662 lst_bool[i] and x[0].isalpha() and len(x) > 1 

3663 for i, x in enumerate(tokens) 

3664 ) 

3665 and not desc.startswith("-") 

3666 and not desc.endswith("-") 

3667 and re.search(r"\w+", desc) 

3668 and ( 

3669 cnt == len(lst_bool) 

3670 or ( 

3671 any( 

3672 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens) 

3673 ) 

3674 and cnt >= len(lst_bool) - 1 

3675 ) 

3676 or cnt / len(lst_bool) >= 0.8 

3677 or ( 

3678 all(x in potentially_english_words for x in rejected_words) 

3679 and cnt / len(lst_bool) >= 0.50 

3680 ) 

3681 ) 

3682 ): 

3683 return "english" 

3684 # Some translations have apparent pronunciation descriptions in /.../ 

3685 # which we'll put in the romanization field (even though they probably are 

3686 # not exactly romanizations). 

3687 if desc.startswith("/") and desc.endswith("/"): 

3688 return "romanization" 

3689 # If all characters are in classes that could occur in romanizations, 

3690 # treat as romanization 

3691 classes = list( 

3692 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK" 

3693 for x in normalized_desc 

3694 ) 

3695 classes1 = [] 

3696 num_latin = 0 

3697 num_greek = 0 

3698 # part = "" 

3699 # for ch, cl in zip(normalized_desc, classes): 

3700 # part += f"{ch}({cl})" 

3701 # print(part) 

3702 for ch, cl in zip(normalized_desc, classes): 

3703 if ch in ( 

3704 "'", # ' in Arabic, / in IPA-like parenthesized forms 

3705 ".", # e.g., "..." in translations 

3706 ";", 

3707 ":", 

3708 "!", 

3709 "‘", 

3710 "’", 

3711 '"', 

3712 "“", 

3713 "”", 

3714 "/", 

3715 "?", 

3716 "…", # alternative to "..." 

3717 "⁉", # 見る/Japanese automatic transcriptions... 

3718 "?", 

3719 "!", 

3720 "⁻", # superscript -, used in some Cantonese roman, e.g. "we" 

3721 "ʔ", 

3722 "ʼ", 

3723 "ʾ", 

3724 "ʹ", 

3725 ): # ʹ e.g. in understand/English/verb Russian transl 

3726 classes1.append("OK") 

3727 continue 

3728 if cl not in ("Ll", "Lu"): 

3729 classes1.append(cl) 

3730 continue 

3731 try: 

3732 name = unicodedata.name(ch) 

3733 first = name.split()[0] 

3734 if first == "LATIN": 

3735 num_latin += 1 

3736 elif first == "GREEK": 

3737 num_greek += 1 

3738 elif first == "COMBINING": # Combining diacritic 3738 ↛ 3739 

3739 cl = "OK" 

3740 elif re.match(non_latin_scripts_re, name): 3740 ↛ 3744 

3741 cl = "NO" # Not acceptable in romanizations 

3742 except ValueError: 

3743 cl = "NO" # Not acceptable in romanizations 

3744 classes1.append(cl) 

3745 # print("classify_desc: {!r} classes1: {}".format(desc, classes1)) 

3746 # print(set(classes1) ) 

3747 if all( 

3748 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK") 

3749 for x in classes1 

3750 ): 

3751 if ( 

3752 (num_latin >= num_greek + 2 or num_greek == 0) 

3753 and classes1.count("OK") < len(classes1) 

3754 and classes1.count("Nd") < len(classes1) 

3755 ): 

3756 return "romanization" 

3757 # Otherwise it is something else, such as a hanji version of the word 

3758 return "other" 

3759 
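# Illustrative sketch (expected classifications inferred from the logic 
# above, not verified against the test suite): 
# 
# classify_desc("feminine plural")   # -> "tags" 
# classify_desc("a large tree")      # -> "english" 
# classify_desc("ogonʹ")             # -> "romanization" 
# classify_desc("огонь")             # -> "other" (Cyrillic script) 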

3760 

3761def remove_text_in_parentheses(text: str) -> str: 

3762 parentheses = 0 

3763 new_text = "" 

3764 for c in text: 

3765 if c == "(": 

3766 parentheses += 1 

3767 elif c == ")": 

3768 parentheses -= 1 

3769 elif parentheses == 0: 

3770 new_text += c 

3771 return new_text
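# Illustrative example: remove_text_in_parentheses("word (obsolete) form") 
# returns "word  form" (note the doubled space): every parenthesized 
# span, including nested ones, is dropped together with the parentheses. 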