Coverage for src/wiktextract/extractor/en/form_descriptions.py: 79%

1415 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 06:55 +0000

1# Code for parsing linguistic form descriptions and tags for word senses 

2# (both the word entry head - initial part and parenthesized parts - 

3# and tags at the beginning of word senses) 

4# 

5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

6 

7import functools 

8import re 

9import unicodedata 

10from typing import ( 

11 Any, 

12 Literal, 

13 Optional, 

14 Sequence, 

15 Union, 

16) 

17 

18import Levenshtein 

19from nltk import TweetTokenizer # type:ignore[import-untyped] 

20from wikitextprocessor.parser import WikiNode 

21 

22from ...datautils import data_append, data_extend, split_at_comma_semi 

23from ...page import extract_links_from_node 

24from ...tags import ( 

25 alt_of_tags, 

26 form_of_tags, 

27 head_final_bantu_langs, 

28 head_final_bantu_map, 

29 head_final_numeric_langs, 

30 head_final_other_langs, 

31 head_final_other_map, 

32 head_final_semitic_langs, 

33 head_final_semitic_map, 

34 uppercase_tags, 

35 valid_tags, 

36 xlat_descs_map, 

37 xlat_head_map, 

38 xlat_tags_map, 

39) 

40from ...topics import topic_generalize_map, valid_topics 

41from ...wxr_context import WiktextractContext 

42from .english_words import ( 

43 english_words, 

44 not_english_words, 

45 potentially_english_words, 

46) 

47from .form_descriptions_known_firsts import known_firsts 

48from .taxondata import known_species 

49from .type_utils import ( 

50 AltOf, 

51 FormData, 

52 LinkageData, 

53 SenseData, 

54 SoundData, 

55 TemplateData, 

56 TranslationData, 

57 WordData, 

58) 

59 

60# Tokenizer for classify_desc() 

61tokenizer = TweetTokenizer() 

62 

63# These are ignored as the value of a related form in form head. 

# Placeholder values (mostly dash-like characters) that carry no information
# when they appear as the value of a related form in a form head.
# NOTE(review): "-" is listed twice below exactly as it appears in the
# reviewed text; the second occurrence may originally have been a different
# (e.g. fullwidth) hyphen character -- confirm against the repository file.
IGNORED_RELATED: set[str] = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    "-",
    "?",
    "(none)",
}

85 

86 

87# First words of unicodedata.name() that indicate scripts that cannot be 

88# accepted in romanizations or english (i.e., should be considered "other" 

89# in classify_desc()). 

90non_latin_scripts: list[str] = [ 

91 "ADLAM", 

92 "ARABIC", 

93 "ARABIC-INDIC", 

94 "ARMENIAN", 

95 "BALINESE", 

96 "BENGALI", 

97 "BRAHMI", 

98 "BRAILLE", 

99 "CANADIAN", 

100 "CHAKMA", 

101 "CHAM", 

102 "CHEROKEE", 

103 "CJK", 

104 "COPTIC", 

105 "COUNTING ROD", 

106 "CUNEIFORM", 

107 "CYRILLIC", 

108 "DOUBLE-STRUCK", 

109 "EGYPTIAN", 

110 "ETHIOPIC", 

111 "EXTENDED ARABIC-INDIC", 

112 "GEORGIAN", 

113 "GLAGOLITIC", 

114 "GOTHIC", 

115 "GREEK", 

116 "GUJARATI", 

117 "GURMUKHI", 

118 "HANGUL", 

119 "HANIFI ROHINGYA", 

120 "HEBREW", 

121 "HIRAGANA", 

122 "JAVANESE", 

123 "KANNADA", 

124 "KATAKANA", 

125 "KAYAH LI", 

126 "KHMER", 

127 "KHUDAWADI", 

128 "LAO", 

129 "LEPCHA", 

130 "LIMBU", 

131 "MALAYALAM", 

132 "MEETEI", 

133 "MYANMAR", 

134 "NEW TAI LUE", 

135 "NKO", 

136 "OL CHIKI", 

137 "OLD PERSIAN", 

138 "OLD SOUTH ARABIAN", 

139 "ORIYA", 

140 "OSMANYA", 

141 "PHOENICIAN", 

142 "SAURASHTRA", 

143 "SHARADA", 

144 "SINHALA", 

145 "SUNDANESE", 

146 "SYLOTI", 

147 "TAI THAM", 

148 "TAKRI", 

149 "TAMIL", 

150 "TELUGU", 

151 "THAANA", 

152 "THAI", 

153 "TIBETAN", 

154 "TIFINAGH", 

155 "TIRHUTA", 

156 "UGARITIC", 

157 "WARANG CITI", 

158 "YI", 

159] 

160non_latin_scripts_re = re.compile( 

161 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b" 

162) 

163 

164# Sanity check xlat_head_map values 

# Every whitespace-separated item in an xlat_head_map value (after an
# optional leading "?") must be a known tag; warn at import time otherwise.
for k, v in xlat_head_map.items():
    v = v.removeprefix("?")
    for tag in v.split():
        if tag in valid_tags:
            continue
        print(f"WARNING: xlat_head_map[{k}] contains unrecognized tag {tag}")

174 

175# Regexp for finding nested translations from translation items (these are 

176# used in, e.g., year/English/Translations/Arabic). This is actually used 

177# in page.py. 

178nested_translations_re = re.compile( 

179 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format( 

180 "|".join( 

181 re.escape(x.removeprefix("?")) 

182 for x in sorted(xlat_head_map.values(), key=len, reverse=True) 

183 if x and not x.startswith("class-") 

184 ) 

185 ) 

186) 

187 

188# Regexp that matches head tag specifiers. Used to match tags from end of 

189# translations and linkages 

# Longer keys are placed first so that the alternation prefers them over
# shorter keys that are their prefixes.
head_final_re_text = (
    r"( -)?( ("
    + "|".join(map(re.escape, sorted(xlat_head_map, key=len, reverse=True)))
    + r"))+"
)
head_final_re = re.compile(head_final_re_text + r"$")

# Regexp used to match head tag specifiers at end of a form for certain
# Bantu languages (particularly Swahili and similar languages).
head_final_bantu_re_text = (
    " (" + "|".join(map(re.escape, head_final_bantu_map)) + ")"
)
head_final_bantu_re = re.compile(head_final_bantu_re_text + "$")

# Regexp used to match head tag specifiers at end of a form for certain
# Semitic languages (particularly Arabic and similar languages).
head_final_semitic_re_text = (
    " (" + "|".join(map(re.escape, head_final_semitic_map)) + ")"
)
head_final_semitic_re = re.compile(head_final_semitic_re_text + "$")

# Regexp used to match head tag specifiers at end of a form for certain
# other languages (e.g., Lithuanian, Finnish, French).
head_final_other_re_text = (
    " (" + "|".join(map(re.escape, head_final_other_map)) + ")"
)
head_final_other_re = re.compile(head_final_other_re_text + "$")

221 

222# Regexp for splitting heads. See parse_word_head(). 

# Shared prefix of both head-splitting patterns: an (unclosed) group holding
# the four kinds of head-final tag specifier as alternatives.
head_split_re_text_part_1 = "(" + "|".join(
    (
        head_final_re_text,
        head_final_bantu_re_text,
        head_final_semitic_re_text,
        head_final_other_re_text,
    )
)

# Optional head-final specifier followed by a separator (" or ", commas or
# semicolons) or by optional spaces at the end of the string.
head_split_re_text = head_split_re_text_part_1 + ")?( or |[,;]+| *$)"

# Same as above, except that semicolons are not treated as separators.
head_split_re_text_no_semicolon = head_split_re_text_part_1 + ")?( or |,+| *$)"

head_split_re = re.compile(head_split_re_text)
head_split_no_semicolon_re = re.compile(head_split_re_text_no_semicolon)

# Number of capturing groups in head_split_re. The compiled pattern already
# knows this exactly, so ask it instead of scanning the pattern text for
# unescaped "(" characters (which would miscount things like an escaped
# backslash followed by a parenthesis, or any non-capturing group).
# NOTE(review): presumably used by parse_word_head() to step over the group
# entries that re.split() inserts -- confirm against the caller.
head_split_re_parens = head_split_re.groups

244 

245# Parenthesized parts that are ignored in translations 

tr_ignored_parens: set[str] = {
    "please verify",
    "(please verify)",
    "transliteration needed",
    "(transliteration needed)",
    "in words with back vowel harmony",
    "(in words with back vowel harmony)",
    "in words with front vowel harmony",
    "(in words with front vowel harmony)",
    "see below",
    "see usage notes below",
}
# Matches either one of the strings above as the *whole* text, or any text
# that merely *starts* with one of the listed maintenance phrases.
# NOTE(review): the last prefix alternative is the single literal
# "for other meanings see lit. " -- confirm that a "|" was not intended
# between "see " and "lit".
tr_ignored_parens_re = re.compile(
    "^("
    + "|".join(map(re.escape, tr_ignored_parens))
    + ")$"
    + r"|^(Can we clean up|Can we verify|for other meanings see lit\. )"
)

267 

268# Translations that are ignored 

269ignored_translations: set[str] = set( 

270 [ 

271 "[script needed]", 

272 "please add this translation if you can", 

273 ] 

274) 

275 

276# Put english text into the "note" field in a translation if it contains one 

277# of these words 

278tr_note_re = re.compile( 

279 r"(\b(article|definite|indefinite|superlative|comparative|pattern|" 

280 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|" 

281 r"postposition|postp|action|actions|articles|" 

282 r"adverb|adverbs|noun|nouns|verb|verbs|before|" 

283 r"after|placed|prefix|suffix|used with|translated|" 

284 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|" 

285 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|" 

286 r"conjugation|declension|class|category|plural|singular|positive|" 

287 r"seldom used|formal|informal|familiar|unspoken|spoken|written|" 

288 r"indicative|progressive|conditional|potential|" 

289 r"accusative|adessive|inessive|superessive|elative|allative|" 

290 r"dialect|dialects|object|subject|predicate|movies|recommended|language|" 

291 r"locative|continuous|simple|continuousness|gerund|subjunctive|" 

292 r"periphrastically|no equivalent|not used|not always used|" 

293 r"used only with|not applicable|use the|signifying|wordplay|pronounced|" 

294 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|" 

295 r"may be replaced|stricter sense|for nonhumans|" 

296 r"sense:|used:|in full:|informally used|followed by|" 

297 r"not restricted to|pertaining to|or optionally with|are optional|" 

298 r"in conjunction with|in compounds|depending on the relationship|" 

299 r"person addressed|one person|multiple persons|may be replaced with|" 

300 r"optionally completed with|in the phrase|in response to|" 

301 r"before a|before an|preceded by|verbs ending|very common|after a verb|" 

302 r"with verb|with uncountable|with the objects|with stative|" 

303 r"can be replaced by|often after|used before|used after|" 

304 r"used in|clipping of|spoken|somewhat|capitalized|" 

305 r"short form|shortening of|shortened form|initialism of|" 

306 r"said to|rare:|rarer also|is rarer|negatively connoted|" 

307 r"previously mentioned|uncountable noun|countable noun|" 

308 r"countable nouns|uncountable nouns|" 

309 r"with predicative|with -|with imperfect|with a negated|" 

310 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|" 

311 r'"|' 

312 r"general term|after a vowel|before a vowel|" 

313 r"form|regular|irregular|alternative)" 

314 r")($|[) ])|^(" 

315 # Following are only matched at the beginning of the string 

316 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:" 

317 r"|e\.g\.,|cf\.|compare|such as|" 

318 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|" 

319 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|" 

320 r"mainly|from|for|also|also:|acronym|" 

321 r"\+|with) " 

322) 

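323# NOTE: a trailing \b cannot close the first group above because several alternatives end in a non-word character (e.g. "sense:", "esp.", "with -"), where \b would not match before a following space; hence the explicit ($|[) ]) check.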

324 

325# Related forms matching this regexp will be considered suspicious if the 

326# page title does not also match one of these. 

327suspicious_related_re = re.compile( 

328 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)" 

329 r"|[][:=<>&#*|]" 

330 r"| \d+$" 

331) 

332 

333# Word forms (head forms, translations, etc) that will be considered ok and 

334# silently accepted even if they would otherwise trigger a suspicious 

335# form warning. 

336ok_suspicious_forms: set[str] = set( 

337 [ 

338 "but en or", # "golden goal"/English/Tr/French 

339 "cœur en or", # "heart of gold"/Eng/Tr/French 

340 "en or", # golden/Eng/Tr/French 

341 "men du", # jet/Etym2/Noun/Tr/Cornish 

342 "parachute en or", # "golden parachute"/Eng/Tr/French 

343 "vieil or", # "old gold"/Eng/Tr/French 

344 # "all that glitters is not gold"/Eng/Tr/French 

345 "tout ce qui brille n’est pas or", 

346 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek 

347 "period or full stop", 

348 ] 

349) 

350 

351 

352# Replacements to be done in classify_desc before tokenizing. This is a 

353# workaround for shortcomings in TweetTokenizer. 

tokenizer_fixup_map = {
    "a.m.": "AM",
    "p.m.": "PM",
}
# Matches any key of tokenizer_fixup_map that starts at a word boundary;
# longer keys are tried first.
tokenizer_fixup_re = re.compile(
    r"\b("
    + "|".join(
        re.escape(key)
        for key in sorted(tokenizer_fixup_map, key=len, reverse=True)
    )
    + r")"
)

368 

369# Unknown tags starting with these words will be silently ignored. 

ignored_unknown_starts: set[str] = {
    "originally",
    "e.g.",
    "c.f.",
    "supplanted by",
    "supplied by",
}

# Matches one of the starts above at the beginning of the text when it is
# followed by a space; longer starts are tried first.
ignored_unknown_starts_re = re.compile(
    "^("
    + "|".join(
        re.escape(start)
        for start in sorted(ignored_unknown_starts, key=len, reverse=True)
    )
    + ") "
)

# If an unknown sequence starts with one of these, it will continue as an
# unknown sequence until the end, unless it turns out to have a replacement.
allowed_unknown_starts: set[str] = {
    "Relating",
    "accompanied",
    "added",
    "after",
    "answering",
    "as",
    "based",
    "before",
    "conjugated",
    "conjunction",
    "construed",
    "especially",
    "expression:",
    "figurative:",
    "followed",
    "for",
    "forms",
    "from",
    "governs",
    "in",
    "indicating",
    "modifying",
    "normally",
    "not",
    "of",
    "preceding",
    "prefixed",
    "referring",
    "relating",
    "revived",
    "said",
    "since",
    "takes",
    "used",
    "with",
    "With",
    "without",
}
# The ignored starts are also allowed starts, so they never cause complaints.
allowed_unknown_starts |= ignored_unknown_starts

434 

435# Full unknown tags that will be ignored in decode_tags() 

436# XXX this set is always empty (check_unknown() still tests membership in it); ask Tatu where the contents went

437ignored_unknown_tags: set[str] = set([]) 

438 

439# Head endings that are mapped to tags 

# Maps " 1st conj." ... " 7th conj." (note the leading space) to
# "conjugation-1" ... "conjugation-7".
head_end_map = {
    f" {ordinal} conj.": f"conjugation-{number}"
    for number, ordinal in enumerate(
        ("1st", "2nd", "3rd", "4th", "5th", "6th", "7th"), start=1
    )
}
# Matches one of the head endings above at the very end of the text.
head_end_re = re.compile(
    "(" + "|".join(map(re.escape, head_end_map)) + ")$"
)

452 

453 

454# Dictionary of language-specific parenthesized head part starts that 

455# either introduce new tags or modify previous tags. The value for each 

456# language is a dictionary that maps the first word of the head part to 

457# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous 

458# tags or a space-separated string of tags to remove, and ``add_tags`` should 

459# be a string of tags to add. 

460lang_specific_head_map: dict[ 

461 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]] 

462] = { 

463 "Danish": { 

464 # prefix: (rem_tags space separate string/True, add_tags s-sep str) 

465 "c": ("neuter", "common-gender"), 

466 "n": ("common-gender", "neuter"), 

467 "pl": ("singular neuter common-gender", "plural"), 

468 "sg": ("plural neuter common-gender", "singular"), 

469 }, 

470} 

471 

472 

473# Regular expression used to strip additional stuff from the end of alt_of and 

474# form_of. 

# Matches from the first occurrence of any of the markers below through the
# end of the text ((?s) lets ".*" cross newlines), so substituting "" for
# the match cuts the text off at that marker.
#
# Every entry is a regular expression fragment, so literal dots must be
# escaped. The entry "\. — " used to be written with a bare "." which
# matched *any* character and therefore also removed the last character of
# the word in front of a plain " — ".
alt_of_form_of_clean_re = re.compile(
    r"(?s)("
    + "|".join(
        [
            r":",
            r'[“"]',
            r";",
            r" \(",
            r" - ",
            r" ־ ",
            r" ᠆ ",
            r" ‐ ",
            r" ‑ ",
            r" ‒ ",
            r" – ",
            r" — ",
            r" ― ",
            r" − ",
            r" ⸺ ",
            r" ⸻ ",
            r" ﹘ ",
            r" ﹣ ",
            # NOTE(review): identical to the " - " entry above as shown in
            # the reviewed text; possibly a different (e.g. fullwidth)
            # hyphen in the repository file -- confirm.
            r" - ",
            r" \+ ",
            r" \(with ",
            r" with -ra/-re",
            r"\. Used ",
            r"\. Also ",
            r"\. Since ",
            r"\. A ",
            r"\.\. A ",
            r"\. An ",
            r"\.\. An ",
            r"\. an ",
            r"\. The ",
            r"\. Spanish ",
            r"\. Language ",
            r"\. former name of ",
            r"\. AIM",
            r"\. OT",
            r"\. Not ",
            r"\. Now ",
            r"\. Nowadays ",
            r"\. Early ",
            r"\. ASEAN",
            r"\. UN",
            r"\. IMF",
            r"\. WHO",
            r"\. WIPO",
            r"\. AC",
            r"\. DC",
            r"\. DNA",
            r"\. RNA",
            r"\. SOB",
            r"\. IMO",
            r"\. Behavior",
            r"\. Income ",
            r"\. More ",
            r"\. Most ",
            r"\. Only ",
            r"\. Also ",
            r"\. From ",
            r"\. Of ",
            r"\.\. Of ",
            r"\. To ",
            r"\. For ",
            r"\. If ",
            r"\. Praenominal ",
            r"\. This ",
            r"\. Replaced ",
            r"\. CHCS is the ",
            r"\. Equivalent ",
            r"\. Initialism ",
            r"\. Note ",
            r"\. Alternative ",
            r"\. Compare ",
            r"\. Cf\. ",
            r"\. Comparable ",
            r"\. Involves ",
            r"\. Sometimes ",
            r"\. Commonly ",
            r"\. Often ",
            r"\. Typically ",
            r"\. Possibly ",
            r"\. Although ",
            r"\. Rare ",
            r"\. Instead ",
            r"\. Integrated ",
            r"\. Distinguished ",
            r"\. Given ",
            r"\. Found ",
            r"\. Was ",
            r"\. In ",
            r"\. It ",
            r"\.\. It ",
            r"\. One ",
            r"\. Any ",
            r"\. They ",
            r"\. Members ",
            r"\. Each ",
            r"\. Original ",
            r"\. Especially ",
            r"\. Usually ",
            r"\. Known ",
            r"\.\. Known ",
            r"\. See ",
            r"\. see ",
            r"\. target was not ",
            r"\. Popular ",
            r"\. Pedantic ",
            r"\. Positive ",
            r"\. Society ",
            r"\. Plan ",
            r"\. Environmentally ",
            r"\. Affording ",
            r"\. Encompasses ",
            r"\. Expresses ",
            r"\. Indicates ",
            r"\. Text ",
            r"\. Large ",
            r"\. Sub-sorting ",
            r"\. Sax",
            r"\. First-person ",
            r"\. Second-person ",
            r"\. Third-person ",
            r"\. 1st ",
            r"\. 2nd ",
            r"\. 3rd ",
            r"\. Term ",
            r"\. Northeastern ",
            r"\. Northwestern ",
            r"\. Southeast ",
            r"\. Egyptian ",
            r"\. English ",
            r"\. Cape Province was split into ",
            r"\. Pañcat",
            r"\. of the ",
            r"\. is ",
            r"\. after ",
            r"\. or ",
            r"\. chromed",
            r"\. percussion",
            r"\. with his ",
            r"\. a\.k\.a\. ",
            r"\. comparative form ",
            r"\. singular ",
            r"\. plural ",
            r"\. present ",
            r"\. his ",
            r"\. her ",
            r"\. equivalent ",
            r"\. measuring ",
            r"\. used in ",
            r"\. cutely ",
            r"\. Protects",
            r'\. "',
            # NOTE(review): "^" is an anchor here, so this entry can only
            # match a "." directly followed by a start-of-text position,
            # which never happens; a literal caret ("\.\^") may have been
            # intended -- left unchanged pending confirmation.
            r"\.^",
            r"\. \+ ",
            r"\., ",
            r"\. — ",
            r", a ",
            r", an ",
            r", the ",
            r", obsolete ",
            r", possessed",  # 'd/English
            r", imitating",  # 1/English
            r", derived from",
            r", called ",
            r", especially ",
            r", slang for ",
            r", used to",  # c/o /English
            r", commonly",  # b/w /English
            r" corresponding to ",
            r" equivalent to ",
            r" popularized by ",
            r" denoting ",
            r" in its various senses\.",
            r" used by ",
            r" but not for ",
            r" since ",
            r" i\.e\. ",
            r" i\. e\. ",
            r" e\.g\. ",
            r" eg\. ",
            r" etc\. ",
            r"\[http",
            r" — used as ",
            r" by K\. Forsyth ",
            r" by J\. R\. Allen ",
            r" by S\. Ferguson ",
            r" by G\. Donaldson ",
            r" May refer to ",
            r" An area or region ",
        ]
    )
    + r").*$"
)

672 

673 

class ValidNode:
    """One node of the tree of valid word sequences.

    A sequence such as "nominative plural" is stored as a chain of nodes,
    one per word. A node does not store its own word: the word is the key
    under which the node is found in its parent's ``children`` dict.

    Attributes:
      end: true when some sequence terminates at this node. Such a node
        may still have children when a longer sequence shares the same
        start ("nominative" and "nominative plural", for example).
      tags: list of tag strings recorded for sequences ending here.
      topics: list of topic strings recorded for sequences ending here.
      children: maps the next word of a sequence to the node for it.
    """

    __slots__ = ("end", "tags", "topics", "children")

    def __init__(
        self,
        end: bool = False,
        tags: Optional[list[str]] = None,
        topics: Optional[list[str]] = None,
        children: Optional[dict[str, "ValidNode"]] = None,
    ) -> None:
        self.end = end
        # A missing *or empty* argument is replaced by a fresh container, so
        # nodes never share a default list/dict.
        self.tags: list[str] = tags if tags else []
        self.topics: list[str] = topics if topics else []
        self.children: dict[str, "ValidNode"] = children if children else {}

703 

704 

def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None:
    """Insert the word sequence ``desc`` into ``tree``.

    ``desc`` is split at single spaces and one node per word is followed or
    created below ``tree``; the node for the last word is marked as the end
    of a sequence. When ``v`` is a non-empty string, each of its
    whitespace-separated items found in ``valid_tags`` is collected as a
    tag and each one found in ``valid_topics`` as a topic (anything else
    only prints a warning). The collected tags, joined with spaces, are
    appended as one string to the end node's ``tags``; likewise the topics
    to its ``topics``.
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(desc, str)
    assert v is None or isinstance(v, str)

    # Walk down the tree, creating the nodes that do not exist yet.
    node = tree
    for word in desc.split(" "):
        if word not in node.children:
            node.children[word] = ValidNode()
        node = node.children[word]
    node.end = True

    if not v:
        return None  # Nothing to record beyond the sequence itself

    found_tags = []
    found_topics = []
    for item in v.split():
        if item in valid_tags:
            found_tags.append(item)
        elif item in valid_topics:
            found_topics.append(item)
        else:
            print(f"WARNING: tag/topic {desc!r} maps to unknown {item!r}")

    joined_topics = " ".join(found_topics)
    joined_tags = " ".join(found_tags)
    if joined_topics:
        node.topics.append(joined_topics)
    if joined_tags:
        node.tags.append(joined_tags)

745 

746 

def add_to_valid_tree1(
    tree: ValidNode,
    k: str,
    v: Union[list[str], tuple[str, ...], str],
    valid_values: Union[set[str], dict[str, Any]],
) -> list[str]:
    """Add sequence ``k`` to ``tree`` once for every value in ``v``.

    ``v`` may be a single string or a list/tuple of strings; each string is
    passed to add_to_valid_tree() as the tags/topics for ``k``. When ``v``
    is empty, ``k`` is added without tags or topics.

    The sequences are added to the ``tree`` argument. (Previously the
    argument was ignored and the module-level ``valid_sequences`` was
    always modified instead.)

    ``valid_values`` is only type-checked; it is not otherwise used here.

    Returns the individual whitespace-separated items of all values in
    ``v``, in order.
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(k, str)
    assert v is None or isinstance(v, (list, tuple, str))
    assert isinstance(valid_values, (set, dict))
    if not v:
        add_to_valid_tree(tree, k, None)
        return []
    if isinstance(v, str):
        v = [v]
    q: list[str] = []
    for vv in v:
        assert isinstance(vv, str)
        add_to_valid_tree(tree, k, vv)
        # Collect each individual tag/topic for the caller
        q.extend(vv.split())
    return q

771 

772 

def add_to_valid_tree_mapping(
    tree: ValidNode,
    mapping: Union[dict[str, Union[list[str], str]], dict[str, str]],
    valid_values: Union[set[str], dict[str, Any]],
    recurse: bool,
) -> None:
    """Add every key of ``mapping`` to the tree with its mapped value(s).

    Each key is registered through add_to_valid_tree1() with its value (a
    string or a list of strings). With ``recurse`` true, every individual
    item returned for the key that is itself a key of ``mapping`` is looked
    up in turn, and its value is registered under the *original* key as
    well; each item is expanded at most once per key.
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(mapping, dict)
    assert isinstance(valid_values, (set, dict))
    assert recurse in (True, False)
    for key, value in mapping.items():
        assert isinstance(key, str)
        assert isinstance(value, (list, str))
        first = [value] if isinstance(value, str) else value
        pending = add_to_valid_tree1(tree, key, first, valid_values)
        if not recurse:
            continue
        seen = set()
        # Items are taken from the end of the list (last in, first out).
        while pending:
            item = pending.pop()
            if item in seen:
                continue
            seen.add(item)
            if item in mapping:
                pending.extend(
                    add_to_valid_tree1(tree, key, mapping[item], valid_values)
                )

802 

803 

804# Tree of sequences considered to be tags (includes sequences that are 

805# mapped to something that becomes one or more valid tags) 

valid_sequences = ValidNode()
# Every key added below that contains a "/" is also collected here.
sequences_with_slashes: set[str] = set()

# The basic tags used in our tag system; some are a bit weird, but easier
# to implement this with 'false' positives than filter out stuff no one else
# uses.
for tag in valid_tags:
    add_to_valid_tree(valid_sequences, tag, tag)
    if "/" in tag:
        sequences_with_slashes.add(tag)

# Uppercase tags map to themselves with whitespace runs turned into hyphens.
for tag in uppercase_tags:
    add_to_valid_tree(valid_sequences, tag, re.sub(r"\s+", "-", tag))
    if "/" in tag:
        sequences_with_slashes.add(tag)

# xlat_tags_map!
add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False)
sequences_with_slashes.update(key for key in xlat_tags_map if "/" in key)

# Add topics to the same table, with all generalized topics also added
for topic in valid_topics:
    assert " " not in topic
    add_to_valid_tree(valid_sequences, topic, topic)
    if "/" in topic:
        sequences_with_slashes.add(topic)

# Let each original topic value stand alone. These are not generally on
# valid_topics. We add the original topics with spaces replaced by hyphens.
for topic in topic_generalize_map:
    add_to_valid_tree(valid_sequences, topic, re.sub(r"\s+", "-", topic))
    if "/" in topic:
        sequences_with_slashes.add(topic)

# Add canonicalized/generalized topic values
add_to_valid_tree_mapping(
    valid_sequences, topic_generalize_map, valid_topics, True
)

843 

844# Regex used to divide a decode candidate into parts that shouldn't 

845# have their slashes turned into spaces 

# The alternatives are sorted (longest first, then alphabetically) because
# sequences_with_slashes is a set: without sorting, which of two overlapping
# keys wins -- a key and a longer key starting with it -- would depend on
# the set's iteration order, which can differ from run to run.
slashes_re = re.compile(
    r"("
    + "|".join(
        re.escape(s)
        for s in sorted(sequences_with_slashes, key=lambda s: (-len(s), s))
    )
    + r")"
)

849 

850# Regexp used to find "words" from word heads and linguistic descriptions 

851word_pattern = ( 

852 r"[^ ,;()\u200e]+|" 

853 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|" 

854 r"[\u2800-\u28ff]|" # Braille characters 

855 r"\(([^()]|\([^()]*\))*\)" 

856) 

857 

858word_re_global = re.compile(word_pattern) 

859 

860 

def distw(titleparts: Sequence[str], word: str) -> float:
    """Computes how distinct ``word`` is from the most similar word in
    ``titleparts``. Returns 1 if words completely distinct, 0 if
    identical, or otherwise something in between.

    The value is the smallest Levenshtein distance to any title word,
    divided by the length of the longer of the two strings. Two empty
    strings count as identical (0), and when ``titleparts`` is empty there
    is nothing to be similar to, so the result is 1.0.
    """
    assert isinstance(titleparts, (list, tuple))
    assert isinstance(word, str)
    return min(
        (
            # The extra 1 only matters when both strings are empty, where
            # it avoids dividing by zero (the distance is 0 in that case).
            Levenshtein.distance(word, tw) / max(len(tw), len(word), 1)
            for tw in titleparts
        ),
        default=1.0,
    )

872 

873 

874def map_with( 

875 ht: dict[str, str | list[str]] | dict[str, str], 

876 lst: Sequence[str], 

877) -> list[str]: 

878 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or 

879 more alternatives each, and returns a combined list of alternatives.""" 

880 assert isinstance(ht, dict) 

881 assert isinstance(lst, (list, tuple)) 

882 ret = [] 

883 for x in lst: 

884 assert isinstance(x, str) 

885 x = x.strip() 

886 x = ht.get(x, x) 

887 if isinstance(x, str): 887 ↛ 890line 887 didn't jump to line 890 because the condition on line 887 was always true

888 if x: 888 ↛ 883line 888 didn't jump to line 883 because the condition on line 888 was always true

889 ret.append(x) 

890 elif isinstance(x, (list, tuple)): 

891 ret.extend(x) 

892 else: 

893 raise RuntimeError("map_with unexpected value: {!r}".format(x)) 

894 return ret 

895 

896 

TagList = list[str]
PosPathStep = tuple[int, TagList, TagList]


def check_unknown(
    from_i: int,
    to_i: int,
    i: int,
    wordlst: Sequence[str],
    allow_any: bool,
    no_unknown_starts: bool,
) -> list[PosPathStep]:
    """Classify the words ``wordlst[from_i:to_i]``, which matched nothing.

    The caller has already decided that this stretch is unknown; this only
    decides what *kind* of unknown it is. Returns a list with zero or one
    path step:

    * ``[]`` when the range is empty, or when the text is listed in
      ``ignored_unknown_tags`` or is just "and" / "or";
    * an "UNKNOWN" step with an empty third element when the text begins
      with one of the ignored unknown starts;
    * an "UNKNOWN" step carrying the text itself when ``allow_any`` is set,
      when the first word starts with "~", or when the text has more than
      one word, begins with an allowed unknown start and
      ``no_unknown_starts`` is false;
    * otherwise an "UNKNOWN" step carrying "error-unknown-tag".

    ``i`` is only type-checked; it is not otherwise used here.
    """
    assert isinstance(to_i, int)
    assert isinstance(from_i, int)
    assert isinstance(i, int)
    if from_i >= to_i:
        return []
    words = wordlst[from_i:to_i]
    tag = " ".join(words)
    assert tag
    if ignored_unknown_starts_re.match(tag):
        # Tags with this start are to be ignored
        return [(from_i, ["UNKNOWN"], [])]
    if tag in ignored_unknown_tags or tag in ("and", "or"):
        return []

    first = words[0]
    accepted = (
        allow_any
        or first.startswith("~")
        or (
            not no_unknown_starts
            and first in allowed_unknown_starts
            and len(words) > 1
        )
    )
    # Put ``tag`` next to the error marker to include the offending text.
    extra = [tag] if accepted else ["error-unknown-tag"]
    return [(from_i, ["UNKNOWN"], extra)]

947 

948 

def add_new1(
    node: ValidNode,
    i: int,
    start_i: int,
    last_i: int,
    new_paths: list[list[PosPathStep]],
    new_nodes: list[tuple[ValidNode, int, int]],
    pos_paths: list[list[list[PosPathStep]]],
    wordlst: list[str],
    allow_any: bool,
    no_unknown_starts: bool,
    max_last_i: int,
) -> int:
    """Record ``node`` as a search state and extend paths if it is terminal.

    ``(node, start_i, last_i)`` is appended to ``new_nodes`` unless it is
    already there. When ``node.end`` is set, one new path is appended to
    ``new_paths`` for every path in ``pos_paths[last_i]``: a step built
    from the node's tags and topics, then whatever check_unknown() returns
    for the words between ``last_i`` and ``start_i``, then the earlier path.

    Returns ``i + 1`` when the node is terminal, otherwise the larger of
    ``max_last_i`` and ``last_i``.
    """
    assert isinstance(new_paths, list)
    state = (node, start_i, last_i)
    if state not in new_nodes:
        new_nodes.append(state)

    if not node.end:
        return max(max_last_i, last_i)  # in case last_i has grown

    # We can see a terminal point in the search tree.
    unknown_steps = check_unknown(
        last_i, start_i, i, wordlst, allow_any, no_unknown_starts
    )
    head = [(last_i, node.tags, node.topics)] + unknown_steps
    for earlier_path in pos_paths[last_i]:
        new_paths.append(head + earlier_path)
    return i + 1

988 

989 

def _count_error_markers(
    tagsets: list[tuple[str, ...]], topics: list[str]
) -> int:
    """Count the strings starting with "error" in the tagsets and topics."""
    in_tagsets = sum(
        1 for tagset in tagsets for s in tagset if s.startswith("error")
    )
    in_topics = sum(1 for s in topics if s.startswith("error"))
    return in_tagsets + in_topics


@functools.lru_cache(maxsize=65536)
def decode_tags(
    src: str,
    allow_any=False,
    no_unknown_starts=False,
) -> tuple[list[tuple[str, ...]], list[str]]:
    """Decode ``src`` with decode_tags1(), retrying once on errors.

    When the first result contains a string starting with "error-", a
    modified source is decoded as well: if ``src`` contains "/", the
    slashes outside of known slash-containing sequences become spaces;
    otherwise " and " / " or " are replaced by a single space. The retry
    result is returned when it is non-empty and has no more error markers
    than the first result; in every other case the first result is
    returned.

    Results are cached by lru_cache, so the returned lists are shared
    between calls with equal arguments.
    """
    tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts)

    failed = any(
        s.startswith("error-") for tagset in tagsets for s in tagset
    ) or any(s.startswith("error-") for s in topics)
    if not failed:
        return tagsets, topics

    if "/" in src:
        # slashes_re has one capturing group, so re.split() alternates
        # between ordinary text (even positions) and matched keys such as
        # "masculine/feminine" (odd positions), which are left untouched.
        pieces = re.split(slashes_re, src)
        if len(pieces) > 1:
            retry_src = "".join(
                piece if idx % 2 else piece.replace("/", " ")
                for idx, piece in enumerate(pieces)
            )
        else:
            # NOTE(review): when no known key matches, the source is
            # retried unchanged, slashes included -- confirm that this is
            # intended rather than replacing the slashes here too.
            retry_src = src
    elif " or " in src or " and " in src:
        # Annoying kludge.
        retry_src = src.replace(" and ", " ").replace(" or ", " ")
    else:
        return tagsets, topics

    new_tagsets, new_topics = decode_tags1(
        retry_src, allow_any, no_unknown_starts
    )
    if (new_tagsets or new_topics) and _count_error_markers(
        new_tagsets, new_topics
    ) <= _count_error_markers(tagsets, topics):
        return new_tagsets, new_topics
    return tagsets, topics

1053 

1054 

def decode_tags1(
    src: str,
    allow_any=False,
    no_unknown_starts=False,
) -> tuple[list[tuple[str, ...]], list[str]]:
    """Decodes tags, doing some canonicalizations.

    ``src`` is split into parts with split_at_comma_semi() (also at ";"
    and ":"), and each part into whitespace-separated words.  The words are
    matched against the ``valid_sequences`` search tree; for every word
    position the candidate paths are collected in ``pos_paths``, keeping at
    most the 10 lowest-weight paths per position.  The lowest-weight final
    path is then converted to the result.

    Returns a tuple ``(tagsets, topics)``: ``tagsets`` is a sorted list of
    unique, sorted tag tuples, and ``topics`` is a list of topics in the
    order they were first seen.  Returns ``([], [])`` when no path was
    found, and a single error tagset when more than 16 tagsets would be
    produced.
    """
    assert isinstance(src, str)

    # print("decode_tags: src={!r}".format(src))

    # pos_paths[i] holds the candidate paths that end at word position i;
    # position 0 starts with one empty path.
    pos_paths: list[list[list[PosPathStep]]] = [[[]]]
    wordlst: list[str] = []
    max_last_i = 0  # pre-initialized here so that it can be used as a ref

    # NOTE(review): max_last_i is an int, so the partial below binds the
    # value 0 rather than a reference; add_new1 is only partly visible from
    # here -- confirm that this is what it expects.
    add_new = functools.partial(
        add_new1,  # pre-set parameters and references for function
        pos_paths=pos_paths,
        wordlst=wordlst,
        allow_any=allow_any,
        no_unknown_starts=no_unknown_starts,
        max_last_i=max_last_i,
    )
    # First split the tags at commas and semicolons.  Their significance is that
    # a multi-word sequence cannot continue across them.
    parts = split_at_comma_semi(src, extra=[";", ":"])

    for part in parts:
        max_last_i = len(wordlst)  # "how far have we gone?"
        lst1 = part.split()
        if not lst1:
            continue
        wordlst.extend(lst1)
        cur_nodes: list[tuple[ValidNode, int, int]] = []  # Currently seen
        for w in lst1:
            # Index of the word being processed (one pos_paths entry is
            # appended per word at the end of this loop body).
            i = len(pos_paths) - 1
            new_nodes: list[tuple[ValidNode, int, int]] = []
            # replacement nodes for next loop
            new_paths: list[list[PosPathStep]] = []
            # print("ITER i={} w={} max_last_i={} wordlst={}"
            #       .format(i, w, max_last_i, wordlst))
            node: ValidNode
            start_i: int
            last_i: int
            for node, start_i, last_i in cur_nodes:
                # ValidNodes are part of a search tree that checks if a
                # phrase is found in xlat_tags_map and other text->tags dicts.
                if w in node.children:
                    # the phrase continues down the tree
                    # print("INC", w)
                    max_last_i = add_new(
                        node.children[w],
                        i,
                        start_i,
                        last_i,
                        new_paths,
                        new_nodes,
                    )
                if node.end:
                    # we've hit an end point, the tags and topics have already
                    # been gathered at some point, don't do anything with the
                    # old stuff
                    if w in valid_sequences.children:
                        # This starts a *new* possible section
                        max_last_i = add_new(
                            valid_sequences.children[w],  # root->
                            i,
                            i,
                            i,
                            new_paths,
                            new_nodes,
                        )
                if w not in node.children and not node.end:
                    # print("w not in node and $: i={} last_i={} wordlst={}"
                    #       .format(i, last_i, wordlst))
                    # If i == last_i == 0, for example (beginning)
                    if (
                        i == last_i
                        or no_unknown_starts
                        or wordlst[last_i] not in allowed_unknown_starts
                    ):
                        # print("NEW", w)
                        if w in valid_sequences.children:
                            # Start new sequences here
                            max_last_i = add_new(
                                valid_sequences.children[w],
                                i,
                                i,
                                last_i,
                                new_paths,
                                new_nodes,
                            )
            if not new_nodes:
                # This is run at the start when i == max_last_i == 0,
                # which is what populates the first node in new_nodes.
                # Some initial words cause the rest to be interpreted as unknown
                # print("not new nodes: i={} last_i={} wordlst={}"
                #       .format(i, max_last_i, wordlst))
                if (
                    i == max_last_i
                    or no_unknown_starts
                    or wordlst[max_last_i] not in allowed_unknown_starts
                ):
                    # print("RECOVER w={} i={} max_last_i={} wordlst={}"
                    #       .format(w, i, max_last_i, wordlst))
                    if w in valid_sequences.children:
                        max_last_i = add_new(
                            # new sequence from root
                            valid_sequences.children[w],
                            i,
                            i,
                            max_last_i,
                            new_paths,
                            new_nodes,
                        )
            cur_nodes = new_nodes  # Completely replace nodes!
            # 2023-08-18, fix to improve performance
            # Decode tags does a big search of the best-shortest matching
            # sequences of tags, but the original algorithm didn't have
            # any culling happen during operation, so in a case with
            # a lot of tags (for example, big blocks of text inserted
            # somewhere by mistake that is processed by decode_tags),
            # it would lead to exponential growth of new_paths contents.
            # This culling, using the same weighting algorithm code as
            # in the original is just applied to new_paths before it is
            # added to pos_paths. Basically it's "take the 10 best paths".
            # This *can* cause bugs if it gets stuck in a local minimum
            # or something, but this whole process is one-dimensional
            # and not that complex, so hopefully it works out...
            pw = []
            path: list[PosPathStep]
            for path in new_paths:
                # Weight is the number of steps; shorter paths are preferred.
                weight = len(path)
                if any(x[1] == ["UNKNOWN"] for x in path):
                    weight += 100  # Penalize unknown paths
                pw.append((weight, path))
            new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]]
            pos_paths.append(new_paths)

        # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}"
        #       .format(max_last_i, len(wordlst), len(pos_paths)))

        # End of this part: close off whatever sequences are still open by
        # appending their final steps to the last position's paths.
        if cur_nodes:
            # print("END HAVE_NODES")
            for node, start_i, last_i in cur_nodes:
                if node.end:
                    # print("$ END start_i={} last_i={}"
                    #       .format(start_i, last_i))
                    for path in pos_paths[start_i]:
                        pos_paths[-1].append(
                            [(last_i, node.tags, node.topics)] + path
                        )
                else:
                    # print("UNK END start_i={} last_i={} wordlst={}"
                    #       .format(start_i, last_i, wordlst))
                    u = check_unknown(
                        last_i,
                        len(wordlst),
                        len(wordlst),
                        wordlst,
                        allow_any,
                        no_unknown_starts,
                    )
                    if pos_paths[start_i]:
                        for path in pos_paths[start_i]:
                            pos_paths[-1].append(u + path)
                    else:
                        pos_paths[-1].append(u)
        else:
            # Check for a final unknown tag
            # print("NO END NODES max_last_i={}".format(max_last_i))
            paths = pos_paths[max_last_i] or [[]]
            u = check_unknown(
                max_last_i,
                len(wordlst),
                len(wordlst),
                wordlst,
                allow_any,
                no_unknown_starts,
            )
            if u:
                # print("end max_last_i={}".format(max_last_i))
                for path in list(paths):  # Copy in case it is the last pos
                    pos_paths[-1].append(u + path)

    # import json
    # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True))

    if not pos_paths[-1]:
        # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src))
        return [], []

    # Find the best path
    pw = []
    for path in pos_paths[-1]:
        weight = len(path)
        if any(x[1] == ["UNKNOWN"] for x in path):
            weight += 100  # Penalize unknown paths
        pw.append((weight, path))
    path = min(pw)[1]

    # Convert the best path to tagsets and topics
    tagsets: list[list[str]] = [[]]
    topics: list[str] = []
    for i, tagspec, topicspec in path:
        if len(tagsets or "") > 16:
            # ctx.error("Too many tagsets! This is probably exponential",
            #           sortid="form_descriptions/20230818")
            return [("error-unknown-tag", "error-exponential-tagsets")], []
        if tagspec == ["UNKNOWN"]:
            # For UNKNOWN steps the third element is appended to every
            # tagset instead of being treated as topics (presumably it
            # carries the error tags produced by check_unknown -- verify).
            new_tagsets = []
            for x in tagsets:
                new_tagsets.append(x + topicspec)
            tagsets = new_tagsets
            continue
        if tagspec:
            # Each alternative in tagspec forks every existing tagset.
            new_tagsets = []
            for x in tagsets:
                for t in tagspec:
                    if t:
                        new_tags = list(x)
                        for tag in t.split():
                            if tag not in new_tags:
                                new_tags.append(tag)
                        new_tagsets.append(new_tags)
                    else:
                        new_tagsets.append(x)
            tagsets = new_tagsets
        if topicspec:
            for t in topicspec:
                for topic in t.split():
                    if topic not in topics:
                        topics.append(topic)

    # print("unsorted tagsets:", tagsets)
    # Deduplicate tags within each tagset and the tagsets themselves.
    ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets))
    # topics = list(sorted(set(topics)))   XXX tests expect not sorted
    # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics))
    # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST
    # of tags. Turning topics into a tuple breaks tests, turning the tuples
    # inside tagsets into lists breaks tests, I'm leaving them mismatched
    # for now. XXX
    return ret_tagsets, topics

1298 

1299 

def parse_head_final_tags(
    wxr: WiktextractContext, lang: str, form: str
) -> tuple[str, list[str]]:
    """Strip head-final tags from the end of ``form``.

    Returns ``(form, tags)`` where ``form`` is the input with recognized
    trailing tag keys removed and ``tags`` lists the tags they map to.
    Language-specific suffix tables (Bantu, Semitic, "other") are tried
    first for languages listed in the respective sets, then the generic
    ``head_final_re`` / ``xlat_head_map`` table is applied repeatedly until
    nothing more is removed.  A suffix is left alone when the page title
    ends with it.  This can also be used for the final gender etc. tags of
    translations and linkages.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(lang, str)  # Should be language that "form" is for
    assert isinstance(form, str)

    # The code below does not cope with runs of whitespace, so normalize
    # them to single spaces first.
    form = re.sub(r"\s+", " ", form.strip())
    if not form:
        return form, []

    origform = form
    tags: list[str] = []

    def report_suspicious_suffix(suffix: str, sortid: str) -> None:
        # Map values starting with "?" mark suffixes worth a debug message.
        wxr.wtp.debug(
            "suspicious suffix {!r} in language {}: {}".format(
                suffix, lang, origform
            ),
            sortid=sortid,
        )

    def strip_mapped_suffix(pattern, mapping, sortid: str) -> None:
        # Shared handling for the Bantu and Semitic suffix tables.
        nonlocal form
        found = re.search(pattern, form)
        if found is None:
            return
        suffix = found.group(1)
        if wxr.wtp.title.endswith(suffix):  # type:ignore[union-attr]
            return
        form = form[: found.start()]
        value = mapping[suffix]
        if value.startswith("?"):
            value = value[1:]
            report_suspicious_suffix(suffix, sortid)
        tags.extend(value.split())

    # Certain Bantu languages (e.g., Swahili)
    if lang in head_final_bantu_langs:
        strip_mapped_suffix(
            head_final_bantu_re,
            head_final_bantu_map,
            "form_descriptions/1028",
        )

    # Certain Semitic languages (e.g., Arabic)
    if lang in head_final_semitic_langs:
        strip_mapped_suffix(
            head_final_semitic_re,
            head_final_semitic_map,
            "form_descriptions/1043",
        )

    # Certain other languages (e.g., Lithuanian, French, Finnish)
    if lang in head_final_other_langs:
        found = re.search(head_final_other_re, form)
        if found is not None:
            suffix = found.group(1)
            if not wxr.wtp.title.endswith(suffix):  # type:ignore[union-attr]
                form = form[: found.start()]
                tags.extend(head_final_other_map[suffix].split(" "))

    # Normal head-final tags: keep peeling them off until nothing changes.
    while True:
        found = re.search(head_final_re, form)
        if found is None:
            break
        suffix = found.group(3)
        # Tags containing digits are only replaced in languages that have
        # head-final numeric tags (e.g., Bantu classes).  Tags are also
        # kept when the title ends with them or *is* them (defunct/English:
        # "more defunct" -> "more" ["archaic"]).
        has_digit = re.search(r"\d", suffix)
        if (
            (has_digit and lang not in head_final_numeric_langs)
            or wxr.wtp.title.endswith(" " + suffix)  # type:ignore[union-attr]
            or wxr.wtp.title == suffix
        ):
            # Form is unchanged, so there is nothing more to find.
            break
        # found.start(3) is where the tag key begins in the form.
        shortened = form[: found.start(3)].strip()
        value = xlat_head_map[suffix]
        if value.startswith("?"):
            value = value[1:]
            report_suspicious_suffix(suffix, "form_descriptions/1077")
        tags.extend(value.split())
        unchanged = shortened == form
        form = shortened
        if unchanged:
            break

    # Warn about forms that still end in something that looks like a tag.
    looks_unhandled = (
        (form.endswith(" or") and not origform.endswith(" or"))
        or re.search(
            r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|"
            r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)"
            r"($|/| (f|m|sg|pl|anim|inan))",
            form,
        )
        or form.endswith(" du")
    )
    if looks_unhandled and form not in ok_suspicious_forms:
        wxr.wtp.debug(
            "suspicious unhandled suffix in {}:"
            " {!r}, originally {!r}".format(lang, form, origform),
            sortid="form_descriptions/1089",
        )

    return form, tags

1436 

1437 

def quote_kept_parens(s: str) -> str:
    """Protect parenthesized expressions that belong to the word itself.

    Known expressions such as the one in "rear admiral (upper half)" have
    their parentheses replaced by ``__lpar__`` / ``__rpar__`` markers so
    that later parsing does not treat them as parenthesized descriptions.
    """
    kept_paren_re = (
        r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|"
        r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)"
    )
    quoted = r"__lpar__\1__rpar__"
    return re.sub(kept_paren_re, quoted, s)

1448 

1449 

def quote_kept_ruby(
    wxr: WiktextractContext,
    ruby_tuples: list[
        tuple[
            str,
            str,
        ]
    ],
    s: str,
) -> str:
    """Quote the parentheses around ruby text in ``s``.

    For each ``(base, ruby)`` pair in ``ruby_tuples``, occurrences of the
    base text followed by the ruby text in parentheses are rewritten to
    ``base__lrub__ruby__rrub__``.  ``s`` is returned unchanged (after a
    debug message) when no ruby pairs are given.
    """
    if len(ruby_tuples) < 1:
        wxr.wtp.debug(
            "quote_kept_ruby called with no ruby",
            sortid="form_description/1114/20230517",
        )
        return s

    bases = [re.escape(base) for base, _ in ruby_tuples]
    readings = [re.escape(reading) for _, reading in ruby_tuples]
    if not (bases and readings):
        wxr.wtp.debug(
            f"empty column in ruby_tuples: {ruby_tuples}",
            sortid="form_description/1124/20230606",
        )
        return s

    # Matches any base followed by any reading in parentheses.
    quote_re = re.compile(
        r"({})\s*\(\s*({})\s*\)".format("|".join(bases), "|".join(readings))
    )
    # Matches each base together with its own reading; only these spans
    # are handed to quote_re.
    pair_alternatives = r"|".join(
        r"{}\(*{}\)*".format(base, reading)
        for base, reading in zip(bases, readings)
    )
    pair_re = re.compile(r"({})".format(pair_alternatives))

    def quote_span(found: re.Match) -> str:
        return quote_re.sub(r"\1__lrub__\2__rrub__", found.group(0))

    return pair_re.sub(quote_span, s)

1496 

1497 

def unquote_kept_parens(s: str) -> str:
    """Turn ``__lpar__...__rpar__`` markers back into real parentheses."""
    quoted_re = re.compile(r"__lpar__(.*?)__rpar__")
    return quoted_re.sub(lambda found: "(" + found.group(1) + ")", s)

1501 

1502 

def add_romanization(
    wxr: WiktextractContext,
    data: WordData,
    roman: str,
    text: str,
    is_reconstruction: bool,
    head_group: Optional[int],
    ruby: Sequence[tuple[str, str]],
) -> None:
    """Add ``roman`` to ``data`` as a related form tagged "romanization".

    Broken romanizations of the form "Yale: hēnpyeng" are intercepted: the
    part before the colon is run through decode_tags() (most romanization
    styles exist as tags), the decoded tags are added next to
    "romanization", and only the part after the colon is kept as the
    romanization.  The actual insertion is delegated to add_related().
    """
    tag_names = ["romanization"]
    prefixed = re.match(r"([^:]+):(.+)", roman)
    if prefixed is not None:
        tagsets, _topics = decode_tags(prefixed.group(1))
        if tagsets:
            tag_names.extend(tag for tagset in tagsets for tag in tagset)
            roman = prefixed.group(2)
    add_related(
        wxr,
        data,
        tag_names,
        [roman],
        text,
        True,
        is_reconstruction,
        head_group,
        ruby,
    )

1535 

1536 

def add_related(
    wxr: WiktextractContext,
    data: WordData,
    tags_lst: Union[list[str], tuple[str, ...]],
    related_list: list[str],
    origtext: str,
    add_all_canonicals: bool,
    is_reconstruction: bool,
    head_group: Optional[int],
    ruby_data: Optional[Sequence[tuple[str, str]]] = None,
    links: list[tuple[str, str]] | None = None,
    link_dict: dict[str, list[str]] | None = None,
) -> Optional[list[tuple[str, ...]]]:
    """Internal helper for post-processing related forms (e.g., in word head).

    ``related_list`` is joined into one form text.  A leading or trailing
    parenthesized part is decoded as tags, or taken as a romanization when
    it classifies as one.  The remaining text is split at "/" into
    alternatives, and for each alternative and each tagset combination the
    result is stored in ``data``:

    * tags containing "alt-of" / "form-of" / "compound-of" add the form to
      ``data["alt_of"]`` / ``data["form_of"]`` / ``data["compound"]`` and
      the tags and topics to ``data`` itself;
    * otherwise a form entry is appended to ``data["forms"]`` (unless an
      identical entry already exists), after parse_head_final_tags() has
      stripped head-final tags.  A "canonical" form equal to the title is
      not added; its remaining tags go to ``data["tags"]`` instead.

    Returns the tagsets to be added to following related forms when the
    leading parenthesized part started with "all" or "both" (cf.
    walrus/English word head), otherwise None.  Also returns None early for
    ignored placeholders and for forms starting with "{{".
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(tags_lst, (list, tuple))
    for x in tags_lst:
        assert isinstance(x, str)
    assert isinstance(related_list, (list, tuple))
    assert isinstance(origtext, str)
    assert add_all_canonicals in (True, False)
    assert isinstance(ruby_data, (list, tuple)) or ruby_data is None
    if ruby_data is None:
        ruby_data = []
    related = " ".join(related_list)
    # print("add_related: tags_lst={} related={}".format(tags_lst, related))
    if related == "[please provide]":
        return None
    if related in IGNORED_RELATED:
        return None
    if is_reconstruction and related.startswith("*") and len(related) > 1:
        related = related[1:]

    # Get title word, with any reconstruction prefix removed
    titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title)  # type:ignore[arg-type]

    def check_related(related: str) -> None:
        # Warn about some suspicious related forms
        m = re.search(suspicious_related_re, related)
        if (m and m.group(0) not in titleword) or (
            related in ("f", "m", "n", "c") and len(titleword) >= 3
        ):
            if "eumhun" in tags_lst:
                return
            if "cangjie-input" in tags_lst:
                return
            if "class" in tags_lst:
                return
            if wxr.wtp.section == "Korean" and re.search(
                r"^\s*\w*>\w*\s*$", related
            ):
                # ignore Korean "i>ni" / "라>나" values
                return
            if (
                wxr.wtp.section == "Burmese"
                and "romanization" in tags_lst
                and re.search(r":", related)
            ):
                # ignore Burmese with ":", that is used in Burmese
                # transliteration of "း", the high-tone visarga.
                return
            wxr.wtp.debug(
                "suspicious related form tags {}: {!r} in {!r}".format(
                    tags_lst, related, origtext
                ),
                sortid="form_descriptions/1147",
            )

    following_tagsets = None  # Tagsets to add to following related forms
    roman = None
    tagsets1: list[tuple[str, ...]] = [tuple()]
    topics1: list[str] = []

    # Leading "(...)" followed by whitespace: tags for this form.
    m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related)
    if m:
        paren = m.group(1)
        related = related[m.end() :]
        m = re.match(r"^(all|both) (.*)", paren)
        if m:
            tagsets1, topics1 = decode_tags(m.group(2))
            following_tagsets = tagsets1
        else:
            tagsets1, topics1 = decode_tags(paren)
    else:
        # Trailing "(...)": either a code point to drop, a romanization,
        # or tags.
        m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related)
        if m:
            paren = m.group(1)
            if paren.startswith("U+"):
                related = related[: m.start()]
            else:
                cls = classify_desc(paren)
                if (
                    cls in ("romanization", "english")
                    and classify_desc(related[: m.start()]) == "other"
                ):
                    roman = paren
                    related = related[: m.start()]
                else:
                    related = related[: m.start()]
                    tagsets1, topics1 = decode_tags(paren)
    if related and related.startswith("{{"):
        wxr.wtp.debug(
            "`{{` in word head form - possible Wiktionary error: {!r}".format(
                related
            ),
            sortid="form_descriptions/1177",
        )
        return None  # Likely Wiktionary coding error
    related = unquote_kept_parens(related)
    # Split related by "/" (e.g., grande/Spanish) superlative in head
    # Do not split if / in word title, see π//Japanese
    if len(related) > 5 and "/" not in wxr.wtp.title:  # type:ignore[operator]
        alts = split_at_comma_semi(related, separators=["/"])
    else:
        alts = [related]
    if ruby_data:
        # prepare some regex stuff in advance
        ks, rs = [], []
        for k, r in ruby_data:
            ks.append(re.escape(k))
            rs.append(re.escape(r))
        splitter = r"((?:{})__lrub__(?:{})__rrub__)".format(
            "|".join(ks), "|".join(rs)
        )
    for related in alts:
        ruby: list[tuple[str, str]] = []
        if ruby_data:
            # Pull the quoted ruby markers out of the form text and record
            # them as (base, ruby) pairs.
            new_related = []
            rub_split = re.split(splitter, related)
            for s in rub_split:
                m = re.match(r"(.+)__lrub__(.+)__rrub__", s)
                if m:
                    # add ruby with (\1, \2)
                    ruby.append((m.group(1), m.group(2)))
                    new_related.append(m.group(1))
                else:
                    new_related.append(s)
            related = "".join(new_related)
        tagsets2, topics2 = decode_tags(" ".join(tags_lst))
        for tags1 in tagsets1:
            assert isinstance(tags1, (list, tuple))
            for tags2 in tagsets2:
                # Check tags2 here; tags1 was already checked above.
                assert isinstance(tags2, (list, tuple))
                dt: LinkageData = {"word": related}
                if roman:
                    dt["roman"] = roman
                if ruby:
                    dt["ruby"] = ruby
                if "alt-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "alt_of", dt)
                elif "form-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "form_of", dt)
                elif "compound-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "compound", related)
                else:
                    lang = wxr.wtp.section or "LANG_MISSING"
                    related, final_tags = parse_head_final_tags(
                        wxr, lang, related
                    )
                    # print("add_related: related={!r} tags1={!r} tags2={!r} "
                    #       "final_tags={!r}"
                    #       .format(related, tags1, tags2, final_tags))
                    tags = list(tags1) + list(tags2) + list(final_tags)
                    check_related(related)
                    form: FormData = {"form": related}
                    if (
                        links
                        and link_dict
                        and (
                            form_links := match_links_to_form(
                                wxr, related, links, link_dict
                            )
                        )
                    ):
                        form["links"] = form_links
                    if head_group:
                        form["head_nr"] = head_group
                    if roman:
                        form["roman"] = roman
                    if ruby:
                        form["ruby"] = ruby
                    data_extend(form, "topics", topics1)
                    data_extend(form, "topics", topics2)
                    if topics1 or topics2:
                        wxr.wtp.debug(
                            "word head form has topics: {}".format(form),
                            sortid="form_descriptions/1233",
                        )
                    # Add tags from canonical form into the main entry
                    if "canonical" in tags:
                        if related in ("m", "f") and len(titleword) > 1:
                            wxr.wtp.debug(
                                "probably incorrect canonical form "
                                "{!r} ignored (probably tag combination "
                                "missing from xlat_head_map)".format(related),
                                sortid="form_descriptions/1241",
                            )
                            continue
                        if (
                            related != titleword
                            or add_all_canonicals
                            or topics1
                            or topics2
                            or ruby
                        ):
                            data_extend(form, "tags", sorted(set(tags)))
                        else:
                            # We won't add canonical form here
                            filtered_tags = list(
                                x for x in tags if x != "canonical"
                            )
                            data_extend(data, "tags", filtered_tags)
                            continue
                    else:
                        data_extend(form, "tags", sorted(set(tags)))
                    # Only insert if the form is not already there
                    for old in data.get("forms", ()):
                        if form == old:
                            break
                    else:
                        data_append(data, "forms", form)

    # If this form had pre-tags that started with "both" or "all", add those
    # tags also to following related forms that don't have their own tags
    # specified.
    return following_tagsets

1782 

1783 

def match_links_to_form(
    wxr: WiktextractContext,
    form: str,
    links: list[tuple[str, str]],
    link_dict: dict[str, list[str]] | None,
) -> list[tuple[str, str]] | None:
    """Find the links that correspond to ``form``, if they are unusual.

    ``links`` is a list of ``(link text, link target)`` pairs and
    ``link_dict`` maps link text to its targets (it is built from ``links``
    when None).  If ``form`` is itself a link text, all of its targets are
    used; if it contains spaces, the first run of consecutive links whose
    texts match the words of ``form`` (ignoring surrounding ",;() ") is
    used.

    Returns the matched ``(text, target)`` pairs only when at least one
    target differs from its text once any "#anchor" is removed from the
    target; returns None when there are no links, no match, or only such
    ordinary links.
    """
    if not links:
        return None
    if link_dict is None:
        link_dict = {}
        for ltxt, ltrg in links:
            link_dict.setdefault(ltxt, []).append(ltrg)

    ret: list[tuple[str, str]] = []
    if form in link_dict:
        targets = link_dict[form]
        if len(targets) > 1 and any(x != targets[0] for x in targets):
            wxr.wtp.warning(
                f"{form=} has many different "
                f"link candidates `{targets}`, "
                f"which can't be disambiguated.",
                sortid="form_descriptions/match_links_to_form",
            )
        ret.extend((form, ltarg) for ltarg in targets)
    elif " " in form:
        # split and search for a sequence of links...
        split_forms = form.split()
        # A whitespace-only form has no words to look for.
        if split_forms:
            for i, (ltext, _ltarg) in enumerate(links):
                if ltext != split_forms[0]:
                    continue
                window = links[i : i + len(split_forms)]
                # The run must not fall off the end of ``links`` and every
                # word must match its link text.
                if len(window) == len(split_forms) and all(
                    f.strip(",;() ") == wtext.strip(",;() ")
                    for f, (wtext, _wtarg) in zip(split_forms, window)
                ):
                    ret = window
                    break

    # We only care about weird links, i.e. those whose target is not just
    # the link text (optionally followed by a "#anchor").  partition() is
    # used rather than slicing at str.find("#"), which returns -1 when
    # there is no "#" and would silently drop the target's last character.
    for txt, tar in ret:
        if txt != tar and txt != tar.partition("#")[0]:
            return ret
    return None

1840 

1841 

# Issue #967: in English word forms, some forms are sometimes skipped
# because they are taggable words and their distw() is too big, like
# "clipping" from "clip".  Keys are words, values the affected forms.
WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = {
    # XXX remember to change me back to clipping after tests.
    "clip": ["clipping"],
    "English": ["English", "Englishes"],
    "common": ["common", "commoner"],
}

# NOTE(review): presumably words whose head text yields these tag words as
# bogus forms; the consumer is not visible in this part of the file.
WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = {
    "unaccountability": ["countable", "uncountable"],
    "uncountability": ["countable", "uncountable"],
}

# Currently empty.
FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {}

# Tag words that describe a form (tense, number, person, ...).
FORM_ASSOCIATED_TAG_WORDS: set[str] = {
    "participle",
    "past",
    "present",
    "singular",
    "plural",
    "first-person",
    "second-person",
    "third-person",
    "gerund",
}

# Placeholder string for semicolons (NOTE(review): presumably substituted
# before splitting at commas/semicolons and restored afterwards -- confirm
# against parse_word_head).
SEMICOLON_REPLACEMENT = "__SEMICOLON__"

1871 

1872 

1873def parse_word_head( 

1874 wxr: WiktextractContext, 

1875 word: str, 

1876 pos: str, 

1877 text: str, 

1878 data: WordData, 

1879 is_reconstruction: bool, 

1880 head_group: Optional[int], 

1881 original_header_nodes: list[WikiNode | str] | None = None, 

1882 ruby=None, 

1883 links: list[ 

1884 tuple[ 

1885 str, 

1886 str, 

1887 ] 

1888 ] 

1889 | None = None, 

1890 label_templates: list[TemplateData] | None = None, 

1891) -> None: 

1892 """Parses the head line for a word for in a particular language and 

1893 part-of-speech, extracting tags and related forms.""" 

1894 assert isinstance(wxr, WiktextractContext) 

1895 assert isinstance(pos, str) 

1896 assert isinstance(text, str) 

1897 assert isinstance(data, dict) 

1898 assert isinstance(ruby, (list, tuple)) or ruby is None 

1899 if ruby is None: 

1900 ruby = [] 

1901 assert is_reconstruction in (True, False) 

1902 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text)) 

1903 # print(f"PARSE_WORD_HEAD: {data=}") 

1904 # print(f"PARSE_WORD_HEAD: {links=}") 

1905 

1906 # Save original text for if we want to look for mismatched form-links 

1907 

1908 link_dict: dict[str, list[str]] | None 

1909 if links is not None: 

1910 link_dict = {} 

1911 for ltxt, ltrg in links: 

1912 if ltxt not in link_dict: 

1913 link_dict[ltxt] = [ 

1914 ltrg, 

1915 ] 

1916 else: 

1917 link_dict[ltxt].append(ltrg) 

1918 else: 

1919 link_dict = None 

1920 

1921 # print(f"MAIN: {links=}") 

1922 link_words_not_alnum = [] 

1923 if not word.isalnum() and not word.replace("-", "").isalnum(): 

1924 # `-` is kosher, add more of these if needed. 

1925 # if the word contains non-letter or -number characters, it 

1926 # might have something that messes with split-at-semi-comma; we 

1927 # collect links so that we can skip splitting them. 

1928 if links is None and original_header_nodes is not None: 

1929 links, _ = extract_links_from_node( 

1930 wxr, 

1931 original_header_nodes, 

1932 remove_anchor_tags=True, 

1933 expand_nodes=True, 

1934 ) 

1935 if links is not None: 1935 ↛ 1939line 1935 didn't jump to line 1939 because the condition on line 1935 was always true

1936 for ltext, ltar in links: 

1937 if not ltext.isalnum(): 

1938 link_words_not_alnum.append(ltext) 

1939 if word not in link_words_not_alnum: 1939 ↛ 1942line 1939 didn't jump to line 1942 because the condition on line 1939 was always true

1940 link_words_not_alnum.append(word) 

1941 

1942 if link_words_not_alnum is None: 1942 ↛ 1943line 1942 didn't jump to line 1943 because the condition on line 1942 was never true

1943 link_words_not_alnum = [] 

1944 

1945 if len(link_words_not_alnum) > 0: 

1946 # if we have link data (that is, links with stuff like commas and 

1947 # spaces), replace word_re with a modified local scope pattern 

1948 # print(f"links {list((c, ord(c)) for link in links for c in link)=}") 

1949 word_re = re.compile( 

1950 r"\b" # In case we have forms that are longer and contain links 

1951 + 

1952 # or words as a substring... 

1953 r"\b|\b".join( 

1954 sorted( 

1955 (re.escape(s) for s in link_words_not_alnum), 

1956 key=lambda x: -len(x), 

1957 ) 

1958 ) 

1959 + r"\b|" 

1960 + word_pattern 

1961 ) 

1962 else: 

1963 word_re = word_re_global 

1964 

1965 if "Lua execution error" in text or "Lua timeout error" in text: 1965 ↛ 1966line 1965 didn't jump to line 1966 because the condition on line 1965 was never true

1966 return 

1967 

1968 # Fix words with "superlative:" or "comparative:" at end of head 

1969 # e.g. grande/Spanish/Adj 

1970 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text) 

1971 

1972 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb 

1973 m = re.search(r", non-past ([^)]+ \([^)]+\))", text) 

1974 if m: 

1975 add_related( 

1976 wxr, 

1977 data, 

1978 ["non-past"], 

1979 [m.group(1)], 

1980 text, 

1981 True, 

1982 is_reconstruction, 

1983 head_group, 

1984 ruby, 

1985 links, 

1986 link_dict, 

1987 ) 

1988 text = text[: m.start()] + text[m.end() :] 

1989 

1990 language = wxr.wtp.section 

1991 titleword = re.sub( 

1992 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE" 

1993 ) 

1994 titleparts = list( 

1995 m.group(0) 

1996 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE") 

1997 ) 

1998 if not titleparts: 1998 ↛ 1999line 1998 didn't jump to line 1999 because the condition on line 1998 was never true

1999 return 

2000 

2001 # Remove " or" from the end to prevent weird canonical forms 

2002 if text.endswith(" or"): 

2003 for tp in titleparts: 

2004 if text.endswith(tp): 2004 ↛ 2005line 2004 didn't jump to line 2005 because the condition on line 2004 was never true

2005 break 

2006 else: 

2007 text = text.removesuffix(" or").rstrip() 

2008 

2009 # Handle the part of the head that is not in parentheses. However, certain 

2010 # parenthesized parts are part of word, and those must be handled 

2011 # specially here. 

2012 if ruby: 

2013 text = quote_kept_ruby(wxr, ruby, text) 

2014 base = text 

2015 base = quote_kept_parens(base) 

2016 base = remove_text_in_parentheses(base) 

2017 base = base.replace("?", "") # Removes uncertain articles etc 

2018 base = re.sub(r"\s+", " ", base) 

2019 base = re.sub(r" ([,;])", r"\1", base) 

2020 base = re.sub(r" • ", r" ", base) 

2021 # Many languages use • as a punctuation mark separating the base 

2022 # from the rest of the head. στάδιος/Ancient Greek, issue #176 

2023 base = base.strip() 

2024 # print(f"{base=}, {text=}") 

2025 

2026 # Check for certain endings in head (mostly for compatibility with weird 

2027 # heads, e.g. rata/Romanian "1st conj." at end) 

2028 m = re.search(head_end_re, base) 

2029 tags: Union[tuple[str, ...], list[str]] = [] 

2030 if m: 2030 ↛ 2031line 2030 didn't jump to line 2031 because the condition on line 2030 was never true

2031 tags = head_end_map[m.group(1).lower()].split() 

2032 data_extend(data, "tags", tags) 

2033 base = base[: m.start()] 

2034 

2035 # Special case: handle Hán Nôm readings for Vietnamese characters 

2036 m = re.match( 

2037 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base 

2038 ) 

2039 if m: 2039 ↛ 2040line 2039 didn't jump to line 2040 because the condition on line 2039 was never true

2040 tag, readings = m.groups() 

2041 tag = re.sub(r"\s+", "-", tag) 

2042 for reading in split_at_comma_semi( 

2043 readings, skipped=link_words_not_alnum 

2044 ): 

2045 add_related( 

2046 wxr, 

2047 data, 

2048 [tag], 

2049 [reading], 

2050 text, 

2051 True, 

2052 is_reconstruction, 

2053 head_group, 

2054 ruby, 

2055 links, 

2056 link_dict, 

2057 ) 

2058 return 

2059 

2060 # Special case: Hebrew " [pattern: nnn]" ending 

2061 m = re.search(r"\s+\[pattern: ([^]]+)\]", base) 

2062 if m: 2062 ↛ 2063line 2062 didn't jump to line 2063 because the condition on line 2062 was never true

2063 add_related( 

2064 wxr, 

2065 data, 

2066 ["class"], 

2067 [m.group(1)], 

2068 text, 

2069 True, 

2070 is_reconstruction, 

2071 head_group, 

2072 ruby, 

2073 links, 

2074 link_dict, 

2075 ) 

2076 base = base[: m.start()] + base[m.end() :] 

2077 

2078 # Clean away some messy "Upload an image" template text used in 

2079 # American Sign Language: 

2080 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp 

2081 m = re.search(r"Upload .+ gif image.", base) 

2082 if m: 2082 ↛ 2083line 2082 didn't jump to line 2083 because the condition on line 2082 was never true

2083 base = base[: m.start()] + base[m.end() :] 

2084 

2085 semicolon_present = False 

2086 # Split the head into alternatives. This is a complicated task, as 

2087 # we do not want to split on "or" or "," when immediately followed by more 

2088 # head-final tags, but otherwise do want to split by them. 

2089 # 20230907 added "or" to this to handle 'true or false', titles with 'or' 

2090 if wxr.wtp.title and ( 

2091 "," in wxr.wtp.title or ";" in wxr.wtp.title or " or " in wxr.wtp.title 

2092 ): 

2093 # If the title has ";", we don't want to split on that and can remove 

2094 # the ; from the splitting regex pretty easily because it's uncommon. 

2095 # However, commas are so common that not splitting on them is just 

2096 # not feasible, and we have to just deal with that if there are 

2097 # alternative forms or variations with stray commas that shouldn't 

2098 # be split. 

2099 if ";" in wxr.wtp.title: 

2100 semicolon_present = True 

2101 base = base.replace(";", SEMICOLON_REPLACEMENT) 

2102 default_splitter = head_split_no_semicolon_re 

2103 else: 

2104 default_splitter = head_split_re 

2105 # A kludge to handle article titles/phrases with commas. 

2106 # Preprocess splits to first capture the title, then handle 

2107 # all the others as usual. 

2108 presplits = re.split(r"({})".format(wxr.wtp.title), base) 

2109 splits = [] 

2110 for psplit in presplits: 

2111 if psplit == wxr.wtp.title: 

2112 splits.append(psplit) 

2113 else: 

2114 splits.extend(re.split(default_splitter, psplit)) 

2115 else: 

2116 # Do the normal split; previous behavior. 

2117 splits = re.split(head_split_re, base) 

2118 # print("BASE: ", repr(base)) 

2119 # print("SPLITS:", splits) 

2120 alts: list[str] = [] 

2121 # print("parse_word_head: splits:", splits, 

2122 # "head_split_re_parens:", head_split_re_parens) 

2123 for i in range( 

2124 0, len(splits) - head_split_re_parens, head_split_re_parens + 1 

2125 ): 

2126 v = splits[i] 

2127 ending = splits[i + 1] or "" # XXX is this correct??? 

2128 # print("parse_word_head alts v={!r} ending={!r} alts={}" 

2129 # .format(v, ending, alts)) 

2130 if alts and (v == "" and ending): 

2131 assert ending[0] == " " 

2132 alts[-1] += " or" + ending # endings starts with space 

2133 elif v or ending: 

2134 alts.append((v or "") + (ending or "")) 

2135 last = splits[-1].strip() 

2136 conn = "" if len(splits) < 3 else splits[-2] 

2137 # print("parse_word_head alts last={!r} conn={!r} alts={}" 

2138 # .format(last, conn, alts)) 

2139 if ( 2139 ↛ 2150line 2139 didn't jump to line 2150 because the condition on line 2139 was never true

2140 alts 

2141 and last 

2142 and ( 

2143 last.split()[0] in xlat_head_map 

2144 or ( 

2145 conn == " or " 

2146 and (alts[-1] + " or " + last).strip() in xlat_head_map 

2147 ) 

2148 ) 

2149 ): 

2150 alts[-1] += " or " + last 

2151 elif last: 2151 ↛ 2152line 2151 didn't jump to line 2152 because the condition on line 2151 was never true

2152 alts.append(last) 

2153 

2154 # print("parse_word_head alts: {}".format(alts)) 

2155 # print(f"{base=}") 

2156 

2157 # Process the head alternatives 

2158 canonicals: list[tuple[list[str], list[str]]] = [] 

2159 mode: Optional[str] = None 

2160 for alt_i, alt in enumerate(alts): 

2161 alt = alt.strip() 

2162 if alt.startswith("compound form:"): 2162 ↛ 2163line 2162 didn't jump to line 2163 because the condition on line 2162 was never true

2163 mode = "compound-form" 

2164 alt = alt[14:].strip() 

2165 if ((dash_i := alt.find(" -")) > 0) and ( 

2166 dash_i > (wxr.wtp.title or "").find(" -") 

2167 ): 

2168 # test_en_head / test_suffixes_at_end_of_form1 

2169 # Some heads have suffixes that end up attached to the form 

2170 # like in https://en.wiktionary.org/wiki/%E6%A5%BD%E3%81%97%E3%81%84 

2171 alt = alt[:dash_i] 

2172 if mode == "compound-form": 2172 ↛ 2173line 2172 didn't jump to line 2173 because the condition on line 2172 was never true

2173 add_related( 

2174 wxr, 

2175 data, 

2176 ["in-compounds"], 

2177 [alt], 

2178 text, 

2179 True, 

2180 is_reconstruction, 

2181 head_group, 

2182 ruby, 

2183 links, 

2184 link_dict, 

2185 ) 

2186 continue 

2187 # For non-first parts, see if it can be treated as tags-only 

2188 if alt_i == 0: 

2189 expanded_alts = [alt] 

2190 else: 

2191 expanded_alts = map_with(xlat_descs_map, [alt]) 

2192 # print("EXPANDED_ALTS:", expanded_alts) 

2193 tagsets: Optional[list[tuple[str, ...]]] 

2194 for alt in expanded_alts: 

2195 baseparts = list(m.group(0) for m in word_re.finditer(alt)) 

2196 if alt_i > 0: 

2197 tagsets, topics = decode_tags(" ".join(baseparts)) 

2198 if not any("error-unknown-tag" in x for x in tagsets): 

2199 data_extend(data, "topics", topics) 

2200 for tags1 in tagsets: 

2201 data_extend(data, "tags", tags1) 

2202 continue 

2203 

2204 alt, tags = parse_head_final_tags( 

2205 wxr, language or "MISSING_LANG", alt 

2206 ) 

2207 tags = list(tags) # Make sure we don't modify anything cached 

2208 tags.append("canonical") 

2209 if alt_i == 0 and "," in wxr.wtp.title or ";" in wxr.wtp.title: # type:ignore[operator] 

2210 # Kludge to handle article titles/phrases with commas. 

2211 # basepart's regex strips commas, which leads to a 

2212 # canonical form that is the title phrase without a comma. 

2213 # basepart in add_related is almost immediately joined with 

2214 # spaces anyhow. XXX not exactly sure why it's 

2215 # canonicals.append((tags, baseparts)) and not (tags, [alt]) 

2216 baseparts = [alt] 

2217 canonicals.append((tags, baseparts)) 

2218 

2219 # If more of this kind of replace-and-return-original kind of stuff is 

2220 # needed, make semicolon_present into a flag enum, something like `modified` 

2221 if semicolon_present: 

2222 new_cans = [] 

2223 for tags, baseparts in canonicals: 

2224 new_cans.append( 

2225 ( 

2226 tags, 

2227 [s.replace(SEMICOLON_REPLACEMENT, ";") for s in baseparts], 

2228 ) 

2229 ) 

2230 canonicals = new_cans 

2231 for tags, baseparts in canonicals: 

2232 add_related( 

2233 wxr, 

2234 data, 

2235 tags, 

2236 baseparts, 

2237 text, 

2238 len(canonicals) > 1, 

2239 is_reconstruction, 

2240 head_group, 

2241 ruby, 

2242 links, 

2243 link_dict, 

2244 ) 

2245 

2246 # Handle parenthesized descriptors for the word form and links to 

2247 # related words 

2248 text = quote_kept_parens(text) 

2249 parens = list( 

2250 m.group(2) 

2251 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text) 

2252 ) 

2253 parens.extend( 

2254 m.group(1) 

2255 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text) 

2256 ) 

2257 have_romanization = False 

2258 have_ruby = False 

2259 hiragana = "" 

2260 katakana = "" 

2261 for paren in parens: 

2262 paren = paren.strip() 

2263 if not paren: 2263 ↛ 2264line 2263 didn't jump to line 2264 because the condition on line 2263 was never true

2264 continue 

2265 can_be_form = True 

2266 if label_templates is not None and paren.startswith( 

2267 "__LABEL_TEMPLATE_" 

2268 ): 

2269 # wxr.wtp.warning("Found label template in head") 

2270 # continue 

2271 can_be_form = False 

2272 m = re.match(r"__LABEL_TEMPLATE_(\d+)__", paren) 

2273 if m is None: 2273 ↛ 2274line 2273 didn't jump to line 2274 because the condition on line 2273 was never true

2274 wxr.wtp.warning( 

2275 f"Label template list magic phrase is broken: `{paren}`", 

2276 sortid="20260508/label list index broken", 

2277 ) 

2278 continue 

2279 ht = label_templates[int(m.group(1))] 

2280 desc = ht.get("expansion", "").strip() 

2281 if desc: 2281 ↛ 2284line 2281 didn't jump to line 2284 because the condition on line 2281 was always true

2282 paren = desc 

2283 else: 

2284 wxr.wtp.warning( 

2285 f"Label template seems to have no text contents: {ht=}", 

2286 sortid="20260508/label_templates", 

2287 ) 

2288 continue 

2289 if paren.startswith("see "): 

2290 continue 

2291 if paren.startswith("U+"): 2291 ↛ 2292line 2291 didn't jump to line 2292 because the condition on line 2291 was never true

2292 continue 

2293 # In some rare cases, strip the word that inflects from the form 

2294 # description, e.g. "look through rose-tinted glasses"/English. 

2295 # `([looks])` 

2296 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren) 

2297 

2298 # If it starts with hiragana or katakana, treat as such form. Note 

2299 # that each hiragana/katakana character is in separate parentheses, 

2300 # so we must concatenate them. 

2301 try: 

2302 un = unicodedata.name(paren[0]).split()[0] 

2303 except ValueError: 

2304 un = "INVALID" 

2305 if un == "KATAKANA": 2305 ↛ 2306line 2305 didn't jump to line 2306 because the condition on line 2305 was never true

2306 katakana += paren 

2307 have_ruby = True 

2308 continue 

2309 if un == "HIRAGANA": 2309 ↛ 2310line 2309 didn't jump to line 2310 because the condition on line 2309 was never true

2310 hiragana += paren 

2311 have_ruby = True 

2312 continue 

2313 

2314 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes," 

2315 # in the middle of the parenthesized expression, e.g. 薄 

2316 def strokes_repl(m: re.Match) -> str: 

2317 strokes1, tags1, strokes2, tags2 = m.groups() 

2318 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]: 

2319 tags = tags.split(", ") 

2320 tags = list( 

2321 "Mainland China" if t == "Mainland" else t for t in tags 

2322 ) 

2323 tags.append("strokes") 

2324 add_related( 

2325 wxr, 

2326 data, 

2327 tags, 

2328 [strokes], 

2329 text, 

2330 True, 

2331 is_reconstruction, 

2332 head_group, 

2333 ruby, 

2334 links, 

2335 link_dict, 

2336 ) 

2337 return ", " 

2338 

2339 if can_be_form is True: 

2340 paren = re.sub( 

2341 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ", 

2342 strokes_repl, 

2343 paren, 

2344 ) 

2345 

2346 descriptors = map_with(xlat_descs_map, [paren]) 

2347 new_desc = [] 

2348 for desc in descriptors: 

2349 new_desc.extend( 

2350 map_with( 

2351 xlat_tags_map, 

2352 split_at_comma_semi( 

2353 desc, extra=[", or "], skipped=link_words_not_alnum 

2354 ), 

2355 ) 

2356 ) 

2357 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None 

2358 following_tags = None # Added to prev_tags from previous parenthesized 

2359 # part, e.g. walrus/English 

2360 # "(both nonstandard, proscribed, uncommon)" 

2361 for desc_i, desc in enumerate(new_desc): 

2362 # print("HEAD DESC: {!r}".format(desc)) 

2363 

2364 # Abort on certain descriptors (assume remaining values are 

2365 # examples or uninteresting, cf. gaan/Navajo, horior/Latin) 

2366 if re.match(r"^(per |e\.g\.$)", desc): 2366 ↛ 2367line 2366 didn't jump to line 2367 because the condition on line 2366 was never true

2367 break 

2368 

2369 # If it all consists of CJK characters, add it with the 

2370 # CJK tag. This is used at least for some Vietnamese 

2371 # words (e.g., ba/Vietnamese) 

2372 try: 

2373 if ( 2373 ↛ 2377line 2373 didn't jump to line 2377 because the condition on line 2373 was never true

2374 all(unicodedata.name(x).startswith("CJK ") for x in desc) 

2375 and can_be_form 

2376 ): 

2377 add_related( 

2378 wxr, 

2379 data, 

2380 ["CJK"], 

2381 [desc], 

2382 text, 

2383 True, 

2384 is_reconstruction, 

2385 head_group, 

2386 ruby, 

2387 links, 

2388 link_dict, 

2389 ) 

2390 continue 

2391 except ValueError: 

2392 pass 

2393 

2394 # Handle some special cases 

2395 splitdesc = desc.split() 

2396 if ( 2396 ↛ 2406line 2396 didn't jump to line 2406 because the condition on line 2396 was never true

2397 len(splitdesc) >= 3 

2398 and splitdesc[1] == "superlative" 

2399 and classify_desc(splitdesc[0]) != "tags" 

2400 and prev_tags 

2401 and can_be_form 

2402 ): 

2403 # Handle the special case of second comparative after comma, 

2404 # followed by superlative without comma. E.g. 

2405 # mal/Portuguese/Adv 

2406 for ts in prev_tags: 

2407 add_related( 

2408 wxr, 

2409 data, 

2410 ts, 

2411 [splitdesc[0]], 

2412 text, 

2413 True, 

2414 is_reconstruction, 

2415 head_group, 

2416 ruby, 

2417 links, 

2418 link_dict, 

2419 ) 

2420 desc = " ".join(splitdesc[1:]) 

2421 elif ( 2421 ↛ 2430line 2421 didn't jump to line 2430 because the condition on line 2421 was never true

2422 len(splitdesc) == 2 

2423 and splitdesc[0] in ("also", "and") 

2424 and prev_tags 

2425 and classify_desc(splitdesc[1]) != "tags" 

2426 and can_be_form 

2427 ): 

2428 # Sometimes alternative forms are prefixed with "also" or 

2429 # "and" 

2430 for ts in prev_tags: 

2431 add_related( 

2432 wxr, 

2433 data, 

2434 ts, 

2435 [splitdesc[1]], 

2436 text, 

2437 True, 

2438 is_reconstruction, 

2439 head_group, 

2440 ruby, 

2441 links, 

2442 link_dict, 

2443 ) 

2444 continue 

2445 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2445 ↛ 2446line 2445 didn't jump to line 2446 because the condition on line 2445 was never true

2446 continue 

2447 

2448 # If only one word, assume it is comma-separated alternative 

2449 # to the previous one 

2450 if len(splitdesc) == 1: 

2451 cls = classify_desc(desc) 

2452 if cls != "tags" and can_be_form: 

2453 if prev_tags: 2453 ↛ 2455line 2453 didn't jump to line 2455 because the condition on line 2453 was never true

2454 # Assume comma-separated alternative to previous one 

2455 for ts in prev_tags: 

2456 add_related( 

2457 wxr, 

2458 data, 

2459 ts, 

2460 [desc], 

2461 text, 

2462 True, 

2463 is_reconstruction, 

2464 head_group, 

2465 ruby, 

2466 links, 

2467 link_dict, 

2468 ) 

2469 continue 

2470 elif distw(titleparts, desc) <= 0.5: 2470 ↛ 2473line 2470 didn't jump to line 2473 because the condition on line 2470 was never true

2471 # Similar to head word, assume a dialectal variation to 

2472 # the base form. Cf. go/Alemannic German/Verb 

2473 add_related( 

2474 wxr, 

2475 data, 

2476 ["alternative"], 

2477 [desc], 

2478 text, 

2479 True, 

2480 is_reconstruction, 

2481 head_group, 

2482 ruby, 

2483 links, 

2484 link_dict, 

2485 ) 

2486 continue 

2487 elif ( 

2488 cls in ("romanization", "english") 

2489 and not have_romanization 

2490 and classify_desc(titleword) == "other" 

2491 and not ( 

2492 "categories" in data and desc in data["categories"] 

2493 ) 

2494 ): 

2495 # Assume it to be a romanization 

2496 add_romanization( 

2497 wxr, 

2498 data, 

2499 desc, 

2500 text, 

2501 is_reconstruction, 

2502 head_group, 

2503 ruby, 

2504 ) 

2505 have_romanization = True 

2506 continue 

2507 

2508 m = re.match(r"^(\d+) strokes?$", desc) 

2509 if m and can_be_form: 

2510 # Special case, used to give #strokes for Han characters 

2511 add_related( 

2512 wxr, 

2513 data, 

2514 ["strokes"], 

2515 [m.group(1)], 

2516 text, 

2517 True, 

2518 is_reconstruction, 

2519 head_group, 

2520 ruby, 

2521 links, 

2522 link_dict, 

2523 ) 

2524 continue 

2525 

2526 # See if it is radical+strokes 

2527 m = re.match( 

2528 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF" 

2529 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)" 

2530 r"( in (Japanese|Chinese|traditional Chinese|" 

2531 r"simplified Chinese))?$", 

2532 desc, 

2533 ) 

2534 if m and can_be_form: 2534 ↛ 2537line 2534 didn't jump to line 2537 because the condition on line 2534 was never true

2535 # Special case, used to give radical + strokes for Han 

2536 # characters 

2537 radical_strokes = m.group(1) 

2538 lang = m.group(3) 

2539 t = ["radical+strokes"] 

2540 if lang: 

2541 t.extend(lang.split()) 

2542 add_related( 

2543 wxr, 

2544 data, 

2545 t, 

2546 [radical_strokes], 

2547 text, 

2548 True, 

2549 is_reconstruction, 

2550 head_group, 

2551 ruby, 

2552 links, 

2553 link_dict, 

2554 ) 

2555 prev_tags = None 

2556 following_tags = None 

2557 continue 

2558 

2559 # See if it indicates historical Katakana orthography (←) or 

2560 # just otherwise katakana/hiragana form 

2561 m = re.match(r"←\s*|kana\s+", desc) 

2562 if m: 2562 ↛ 2563line 2562 didn't jump to line 2563 because the condition on line 2562 was never true

2563 if desc.startswith("←"): 

2564 t1 = "historical " 

2565 else: 

2566 t1 = "" 

2567 x = desc[m.end() :] 

2568 if x.endswith("?"): 

2569 x = x[:-1] 

2570 # XXX should we add a tag indicating uncertainty? 

2571 if x: 

2572 name = unicodedata.name(x[0]) 

2573 if name.startswith("HIRAGANA "): 

2574 desc = t1 + "hiragana " + x 

2575 elif name.startswith("KATAKANA "): 

2576 desc = t1 + "katakana " + x 

2577 

2578 # See if it is "n strokes in Chinese" or similar 

2579 m = re.match( 

2580 r"(\d+) strokes in (Chinese|Japanese|" 

2581 r"traditional Chinese|simplified Chinese)$", 

2582 desc, 

2583 ) 

2584 if m and can_be_form: 2584 ↛ 2586line 2584 didn't jump to line 2586 because the condition on line 2584 was never true

2585 # Special case, used to give just strokes for some Han chars 

2586 strokes = m.group(1) 

2587 lang = m.group(2) 

2588 t = ["strokes"] 

2589 t.extend(lang.split()) 

2590 add_related( 

2591 wxr, 

2592 data, 

2593 t, 

2594 [strokes], 

2595 text, 

2596 True, 

2597 is_reconstruction, 

2598 head_group, 

2599 ruby, 

2600 links, 

2601 link_dict, 

2602 ) 

2603 prev_tags = None 

2604 following_tags = None 

2605 continue 

2606 

2607 # American Sign Language has images (or requests for image) 

2608 # as heads, + this ASL gloss after. 

2609 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text) 

2610 if m2 and can_be_form: 2610 ↛ 2611line 2610 didn't jump to line 2611 because the condition on line 2610 was never true

2611 add_related( 

2612 wxr, 

2613 data, 

2614 ["ASL-gloss"], 

2615 [m2.group(1)], 

2616 text, 

2617 True, 

2618 is_reconstruction, 

2619 head_group, 

2620 ruby, 

2621 links, 

2622 link_dict, 

2623 ) 

2624 continue 

2625 

2626 parts = list(m.group(0) for m in re.finditer(word_re, desc)) 

2627 if not parts: 2627 ↛ 2628line 2627 didn't jump to line 2628 because the condition on line 2627 was never true

2628 prev_tags = None 

2629 following_tags = None 

2630 continue 

2631 

2632 # Check for certain language-specific header part starts that 

2633 # modify 

2634 if ( 2634 ↛ 2639line 2634 didn't jump to line 2639 because the condition on line 2634 was never true

2635 len(parts) == 2 

2636 and language in lang_specific_head_map 

2637 and can_be_form 

2638 ): 

2639 ht2 = lang_specific_head_map[language] 

2640 if parts[0] in ht2: 

2641 rem_tags, add_tags = ht2[parts[0]] 

2642 new_prev_tags1: list[list[str]] = [] 

2643 tags2: Union[tuple[str, ...], list[str]] 

2644 for tags2 in prev_tags or [()]: 

2645 if rem_tags is True: # Remove all old tags 

2646 tsets = set() 

2647 else: 

2648 tsets = set(tags2) - set(rem_tags.split()) 

2649 tsets = tsets | set(add_tags.split()) 

2650 tags = list(sorted(tsets)) 

2651 add_related( 

2652 wxr, 

2653 data, 

2654 tags, 

2655 [parts[1]], 

2656 text, 

2657 True, 

2658 is_reconstruction, 

2659 head_group, 

2660 ruby, 

2661 links, 

2662 link_dict, 

2663 ) 

2664 new_prev_tags1.append(tags) 

2665 prev_tags = new_prev_tags1 

2666 following_tags = None 

2667 continue 

2668 

2669 # Handle the special case of descriptors that are parenthesized, 

2670 # e.g., (archaic or Scotland) 

2671 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc) 

2672 if m is not None and classify_desc(m.group(1)) == "tags": 2672 ↛ 2673line 2672 didn't jump to line 2673 because the condition on line 2672 was never true

2673 tagpart = m.group(1) 

2674 related = [m.group(2)] 

2675 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True) 

2676 if topics: 

2677 wxr.wtp.debug( 

2678 "parenthized head part {!r} contains topics: {}".format( 

2679 tagpart, topics 

2680 ), 

2681 sortid="form_descriptions/1647", 

2682 ) 

2683 elif m is not None and re.match(r"in the sense ", m.group(1)): 2683 ↛ 2686line 2683 didn't jump to line 2686 because the condition on line 2683 was never true

2684 # Handle certain ignored cases 

2685 # e.g. bord/Danish: in the sense "plank" 

2686 related = [m.group(2)] 

2687 tagsets = [()] 

2688 else: 

2689 # Normal parsing of the descriptor 

2690 alt_related = None 

2691 alt_tagsets = None 

2692 tagsets = None 

2693 for i in range(len(parts), 0, -1): 

2694 related = parts[i:] 

2695 tagparts = parts[:i] 

2696 # print(" i={} related={} tagparts={}" 

2697 # .format(i, related, tagparts)) 

2698 tagsets, topics = decode_tags( 

2699 " ".join(tagparts), no_unknown_starts=True 

2700 ) 

2701 # print("tagparts={!r} tagsets={} topics={} related={} " 

2702 # "alt_related={} distw={:.2f}" 

2703 # .format(tagparts, tagsets, topics, related, 

2704 # alt_related, 

2705 # distw(titleparts, parts[i - 1]))) 

2706 if ( 

2707 topics 

2708 or not tagsets 

2709 or any("error-unknown-tag" in x for x in tagsets) 

2710 ): 

2711 if alt_related is not None: 2711 ↛ 2713line 2711 didn't jump to line 2713 because the condition on line 2711 was never true

2712 # We already had a good division, so let's stop. 

2713 break 

2714 # Bad division, try deeper 

2715 continue 

2716 # print(f"{parts[i-1]=}, {parts=}") 

2717 if ( 

2718 i > 1 

2719 and len(parts[i - 1]) >= 4 

2720 and ( 

2721 distw(titleparts, parts[i - 1]) <= 0.4 

2722 or ( 

2723 wxr.wtp.section == "English" 

2724 and wxr.wtp.title 

2725 in WORDS_WITH_FALSE_POSITIVE_TAGS 

2726 and parts[i - 1] 

2727 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title] 

2728 ) 

2729 ) 

2730 # Fixes 'unaccountability' wiktext #1196 

2731 and not ( 

2732 wxr.wtp.section == "English" 

2733 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS 

2734 and parts[i - 1] 

2735 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title] 

2736 ) 

2737 # Fixes wiktextract #983, where "participle" 

2738 # was too close to "Martinize" and so this accepted 

2739 # ["participle", "Martinize"] as matching; this 

2740 # kludge prevents this from happening if titleparts 

2741 # is shorter than what would be 'related'. 

2742 # This breaks if we want to detect stuff that 

2743 # actually gets an extra space-separated word when 

2744 # 'inflected'. 

2745 and ( 

2746 len(titleparts) >= len(parts[i - 1 :]) 

2747 or "or" in parts[i - 1 :] 

2748 ) 

2749 ): 

2750 # print(f"Reached; {parts=}, {parts[i-1]=}") 

2751 alt_related = related 

2752 alt_tagsets = tagsets 

2753 continue 

2754 alt_related = None 

2755 alt_tagsets = None 

2756 break 

2757 # for-else 

2758 else: 

2759 if alt_related is None and can_be_form: 

2760 # Check if the parenthesized part is likely a 

2761 # romanization 

2762 if ( 2762 ↛ 2770line 2762 didn't jump to line 2770 because the condition on line 2762 was never true

2763 (have_ruby or classify_desc(base) == "other") 

2764 and classify_desc(paren) == "romanization" 

2765 and not ( 

2766 "categories" in data 

2767 and desc in data["categories"] 

2768 ) 

2769 ): 

2770 for r in split_at_comma_semi( 

2771 paren, 

2772 extra=[" or "], 

2773 skipped=link_words_not_alnum, 

2774 ): 

2775 add_romanization( 

2776 wxr, 

2777 data, 

2778 r, 

2779 text, 

2780 is_reconstruction, 

2781 head_group, 

2782 ruby, 

2783 ) 

2784 have_romanization = True 

2785 continue 

2786 tagsets = [("error-unrecognized-head-form",)] 

2787 wxr.wtp.debug( 

2788 "unrecognized head form: {}".format(desc), 

2789 sortid="form_descriptions/1698", 

2790 ) 

2791 continue 

2792 

2793 if alt_related is not None: 2793 ↛ 2794line 2793 didn't jump to line 2794 because the condition on line 2793 was never true

2794 related = alt_related 

2795 tagsets = alt_tagsets 

2796 

2797 # print("FORM END: tagsets={} related={}".format(tagsets, related)) 

2798 # print("==================") 

2799 

2800 if ( 2800 ↛ 2821line 2800 didn't jump to line 2821 because the condition on line 2800 was never true

2801 len(related) <= 0 

2802 and wxr.wtp.section == "English" 

2803 and tagsets is not None 

2804 and len(tagsets) > 0 

2805 and not any( 

2806 s.startswith("error-") for tagset in tagsets for s in tagset 

2807 ) 

2808 and any( 

2809 s in FORM_ASSOCIATED_TAG_WORDS 

2810 for tagset in tagsets 

2811 for s in tagset 

2812 ) 

2813 and ( 

2814 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS 

2815 and not any( 

2816 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""] 

2817 for rel in related 

2818 ) 

2819 ) 

2820 ): 

2821 wxr.wtp.debug( 

2822 f"Form tags without form: {desc=}, {tagsets=}", 

2823 sortid="form_description/20250107", 

2824 ) 

2825 if not tagsets: 2825 ↛ 2826line 2825 didn't jump to line 2826 because the condition on line 2825 was never true

2826 continue 

2827 

2828 # print(f"{alts=}, {related=}") 

2829 

2830 assert isinstance(related, (list, tuple)) 

2831 related_str = " ".join(related) 

2832 if "or" in titleparts: 

2833 alts = [related_str] 

2834 else: 

2835 alts = split_at_comma_semi( 

2836 related_str, 

2837 separators=[r"\bor\b"], 

2838 skipped=link_words_not_alnum, 

2839 ) 

2840 # print(f"{related_str=}, {alts=}") 

2841 if not alts: 

2842 alts = [""] 

2843 for related_str in alts: 

2844 if related_str and can_be_form: 

2845 if prev_tags and ( 

2846 all( 

2847 all( 

2848 t in ["nonstandard", "dialectal"] 

2849 or valid_tags[t] == "dialect" 

2850 for t in tags 

2851 ) 

2852 for ts in tagsets 

2853 ) 

2854 or ( 

2855 any("participle" in ts for ts in prev_tags) 

2856 and all( 

2857 "attributive" in ts 

2858 or any(valid_tags[t] == "gender" for t in ts) 

2859 for ts in tagsets 

2860 ) 

2861 ) 

2862 ): 

2863 # Merged with previous tags. Don't update previous 

2864 # tags here; cf. burn/English/Verb 

2865 for tags_l in tagsets: 

2866 for ts in prev_tags: 

2867 tags_l1 = sorted(set(tags_l) | set(ts)) 

2868 add_related( 

2869 wxr, 

2870 data, 

2871 tags_l1, 

2872 [related_str], 

2873 text, 

2874 True, 

2875 is_reconstruction, 

2876 head_group, 

2877 ruby, 

2878 links, 

2879 link_dict, 

2880 ) 

2881 else: 

2882 # Not merged with previous tags 

2883 for tags_l in tagsets: 

2884 if following_tags is not None: 2884 ↛ 2885line 2884 didn't jump to line 2885 because the condition on line 2884 was never true

2885 for ts in following_tags: 

2886 tags_l1 = list( 

2887 sorted(set(tags_l) | set(ts)) 

2888 ) 

2889 add_related( 

2890 wxr, 

2891 data, 

2892 tags_l1, 

2893 [related_str], 

2894 text, 

2895 True, 

2896 is_reconstruction, 

2897 head_group, 

2898 ruby, 

2899 links, 

2900 link_dict, 

2901 ) 

2902 else: 

2903 ret = add_related( 

2904 wxr, 

2905 data, 

2906 tags_l, 

2907 [related_str], 

2908 text, 

2909 True, 

2910 is_reconstruction, 

2911 head_group, 

2912 ruby, 

2913 links, 

2914 link_dict, 

2915 ) 

2916 if ret is not None: 2916 ↛ 2917line 2916 didn't jump to line 2917 because the condition on line 2916 was never true

2917 following_tags = ret 

2918 prev_tags = tagsets 

2919 else: 

2920 if desc_i < len(new_desc) - 1 and all( 2920 ↛ 2927line 2920 didn't jump to line 2927 because the condition on line 2920 was never true

2921 "participle" in ts or "infinitive" in ts 

2922 for ts in tagsets 

2923 ): 

2924 # Interpret it as a standalone form description 

2925 # in the middle, probably followed by forms or 

2926 # language-specific descriptors. cf. drikke/Danish 

2927 new_prev_tags2 = [] 

2928 for ts1 in prev_tags or [()]: 

2929 for ts2 in tagsets: 

2930 ts = tuple(sorted(set(ts1) | set(ts2))) 

2931 new_prev_tags2.append(ts) 

2932 prev_tags = new_prev_tags2 

2933 continue 

2934 for tags in tagsets: 

2935 data_extend(data, "tags", tags) 

2936 prev_tags = tagsets 

2937 following_tags = None 

2938 

2939 # Finally, if we collected hiragana/katakana, add them now 

2940 if hiragana: 2940 ↛ 2941line 2940 didn't jump to line 2941 because the condition on line 2940 was never true

2941 add_related( 

2942 wxr, 

2943 data, 

2944 ["hiragana"], 

2945 [hiragana], 

2946 text, 

2947 True, 

2948 is_reconstruction, 

2949 head_group, 

2950 ruby, 

2951 ) 

2952 if katakana: 2952 ↛ 2953line 2952 didn't jump to line 2953 because the condition on line 2952 was never true

2953 add_related( 

2954 wxr, 

2955 data, 

2956 ["katakana"], 

2957 [katakana], 

2958 text, 

2959 True, 

2960 is_reconstruction, 

2961 head_group, 

2962 ruby, 

2963 ) 

2964 

2965 # XXX check if this is actually relevant, tags in word root data 

2966 # is extremely rare (not sure where they slip through). 

2967 tags = data.get("tags", []) # type:ignore 

2968 if len(tags) > 0: 

2969 # wxr.wtp.debug( 

2970 # f"Tags appear in word root data: {data['tags']=}", # type:ignore 

2971 # sortid="form_descriptions/2620/20240606", 

2972 # ) # Messes up tests. 

2973 data["tags"] = sorted(set(tags)) # type:ignore 

2974 

2975 

def parse_sense_qualifier(
    wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData]
) -> None:
    """Extract tags, topics, a taxonomic name and/or a free-form qualifier
    from the qualifier string ``text`` and store them in ``data``.

    One surrounding pair of parentheses and then one surrounding pair of
    double quotes are removed first.  The text is mapped through
    ``xlat_descs_map`` and split with ``split_at_comma_semi``; each non-empty
    piece is classified with ``classify_desc`` (only the part before the
    first colon is classified) and handled according to its class.  Nothing
    is returned; ``data`` is modified in place."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(text, str)
    assert isinstance(data, dict)
    # print("parse_sense_qualifier:", text)

    # Unwrap "(...)" first and then '"..."', in that order.
    for wrapper_re in (r"\([^()]+\)$", r'"[^"]+"$'):
        if re.match(wrapper_re, text):
            text = text[1:-1]

    collected: list[str] = []
    for desc in map_with(xlat_descs_map, [text]):
        for piece in split_at_comma_semi(desc):
            if not piece:
                continue
            # Classification looks only at what precedes the first colon,
            # but the untruncated piece is what is stored as a qualifier.
            head = piece.partition(":")[0]
            kind = classify_desc(head, allow_unknown_tags=True)
            # print("parse_sense_qualifier: classify_desc: {} -> {}"
            #       .format(head, kind))
            if kind == "tags":
                tagsets, topics = decode_tags(head)
                data_extend(data, "topics", topics)
                # XXX should think how to handle distinct options better,
                # e.g., "singular and plural genitive"; that can't really be
                # done with changing the calling convention of this function.
                # Should split sense if more than one category of tags
                # differs.
                for tagset in tagsets:
                    collected.extend(tagset)
            elif kind == "taxonomic":
                # A leading "×" before a capital letter is dropped from the
                # stored name and recorded as the "extinct" tag instead.
                if re.match(r"×[A-Z]", head):
                    collected.append("extinct")
                    head = head[1:]
                data["taxonomic"] = head
            elif kind == "english":
                if "qualifier" in data and data["qualifier"] != piece:
                    data["qualifier"] += "; " + piece
                else:
                    data["qualifier"] = piece
            else:
                wxr.wtp.debug(
                    "unrecognized sense qualifier: {}".format(desc),
                    sortid="form_descriptions/1831",
                )

    # Tags are added once at the end, de-duplicated and sorted.
    data_extend(data, "tags", sorted(set(collected)))

3028 

3029 

def parse_pronunciation_tags(
    wxr: WiktextractContext, text: str, data: SoundData
) -> None:
    """Interpret ``text`` as a pronunciation qualifier and store the result
    in ``data``.

    The stripped text is classified with ``classify_desc``.  If it is
    classified as tags, the decoded topics are added under "topics", tags
    without a space are appended under "tags", and tags containing a space
    are joined with "; " and stored under "note".  Any other classification
    stores the whole stripped text under "note".  Empty text does nothing."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(text, str)
    assert isinstance(data, dict)
    stripped = text.strip()
    if not stripped:
        return

    if classify_desc(stripped) != "tags":
        # Not decodable as tags: keep the text verbatim as a note.
        data["note"] = stripped
        return

    tagsets, topics = decode_tags(stripped)
    data_extend(data, "topics", topics)
    spaced_tags = []
    for tagset in tagsets:
        for tag in tagset:
            if " " in tag:
                # Tags containing a space go to the note, not to "tags".
                spaced_tags.append(tag)
            else:
                data_append(data, "tags", tag)
    if spaced_tags:
        data["note"] = "; ".join(spaced_tags)

3054 

3055 

def parse_translation_desc(
    wxr: WiktextractContext, lang: str, text: str, tr: TranslationData
) -> None:
    """Parse one translation item ``text`` (in language ``lang``) and store
    what is found in ``tr``.

    Parenthesized expressions are removed from ``text`` one at a time (first
    from the end, then from the start, then from the middle) and each is
    classified with ``classify_desc``.  Depending on the class the content
    ends up in ``tr`` under "sense", "alt", "roman", "tags", "topics",
    "note", "english"/"translation" or "taxonomic"; romanization-class
    parentheses next to English/romanization-class text are put back into
    the word.  The remaining text is stored under "word".  Nothing is
    returned; ``tr`` is modified in place."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(lang, str)  # The language of ``text``
    assert isinstance(text, str)
    assert isinstance(tr, dict)
    # print("parse_translation_desc:", text)

    def absorb_tags(desc: str) -> None:
        # Decode ``desc`` as tags and merge its tags, then topics, into tr.
        tagsets, topics = decode_tags(desc)
        for tagset in tagsets:
            data_extend(tr, "tags", tagset)
        data_extend(tr, "topics", topics)

    # Process all parenthesized parts from the translation item
    note = None
    kept_prefix = ""
    kept_suffix = ""
    while True:
        at_start = False
        # Try the end of the text first ...
        found = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)
        if found:
            paren = found.group(1)
            text = text[: found.start()]
            if paren.startswith(("literally ", "lit.")):
                continue  # Not useful for disambiguation in many idioms
        else:
            # ... then the start of the text ...
            found = re.match(
                r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text
            )
            if found:
                paren = found.group(1)
                text = text[found.end() :]
                at_start = True
                if re.match(r"^(\d|\s|,| or | and )+$", paren):
                    # A leading parenthesis holding only digits and their
                    # combinations is taken as the sense description when
                    # no sense has been set yet; otherwise it is ignored.
                    if not tr.get("sense"):
                        tr["sense"] = paren
                    continue
            else:
                # ... and finally anywhere in the middle.  Romanizations
                # are sometimes between word and gender marker,
                # e.g. wife/English/Tr/Yiddish.
                found = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text)
                if not found:
                    # No more parenthesized expressions
                    break
                paren = found.group(1)
                text = text[: found.start()] + text[found.end() :]

        # Some cleanup of artifacts that may result from skipping some
        # templates in earlier stages
        paren = paren.removeprefix(": ")
        paren = paren.removesuffix(",")
        if re.match(r'^[“"]([^“”"]*)[“”"]$', paren):
            paren = paren[1:-1]
        paren = paren.strip()

        # Check for special script pronunciation followed by romanization,
        # used in many Asian languages.
        items = paren.split(", ")
        if len(items) == 2:
            script, latin = items
            if classify_desc(script) == "other":
                latin_kind = classify_desc(latin)
                # print("parse_translation_desc: r={} cls={}"
                #       .format(latin, latin_kind))
                if latin_kind == "romanization" or (
                    latin_kind == "english"
                    and len(latin.split()) == 1
                    and latin[0].islower()
                ):
                    old_alt = tr.get("alt")
                    if old_alt and old_alt != script:
                        wxr.wtp.debug(
                            'more than one value in "alt": {} vs. {}'.format(
                                tr["alt"], script
                            ),
                            sortid="form_descriptions/1930",
                        )
                    tr["alt"] = script
                    old_roman = tr.get("roman")
                    if old_roman and old_roman != latin:
                        wxr.wtp.debug(
                            'more than one value in "roman": {} vs. {}'.format(
                                tr["roman"], latin
                            ),
                            sortid="form_descriptions/1936",
                        )
                    tr["roman"] = latin
                    continue

        # Peel tag-like items off the beginning or the end of the
        # comma-separated list, always leaving at least one item.
        while len(items) > 1:
            if classify_desc(items[0]) == "tags":
                absorb_tags(items[0])
                items = items[1:]
            elif classify_desc(items[-1]) == "tags":
                absorb_tags(items[-1])
                items = items[:-1]
            else:
                break
        paren = ", ".join(items)

        if not paren or re.search(tr_ignored_parens_re, paren):
            continue
        if paren.startswith("numeral:"):
            paren = paren[8:].strip()

        # Classify the part in parenthesis and process accordingly
        kind = classify_desc(paren)
        # print("parse_translation_desc classify: {!r} -> {}"
        #       .format(paren, kind))
        if paren == "f":
            data_append(tr, "tags", "feminine")
        elif paren == "m":
            data_append(tr, "tags", "masculine")
        elif kind == "tags":
            absorb_tags(paren)
        elif kind == "english":
            if re.search(tr_note_re, paren):
                # Text matching tr_note_re is kept as a "note" instead of
                # as an English translation.
                paren = paren.removesuffix(":")
                if paren != "see entry for forms":
                    note = note + ";" + paren if note else paren
            elif "translation" in tr and "english" in tr:
                # There can be more than one parenthesized english item,
                # see e.g. Aunt/English/Translations/Tamil
                tr["english"] += "; " + paren  # DEPRECATED for "translation"
                tr["translation"] += "; " + paren
            else:
                tr["english"] = paren  # DEPRECATED for "translation"
                tr["translation"] = paren
        elif kind == "romanization":
            # print("roman text={!r} text cls={}"
            #       .format(text, classify_desc(text)))
            keep_in_word = (
                classify_desc(text) in ("english", "romanization")
                and lang != "Egyptian"
            )
            if not keep_in_word:
                if tr.get("roman"):
                    wxr.wtp.debug(
                        'more than one value in "roman": {} vs. {}'.format(
                            tr["roman"], paren
                        ),
                        sortid="form_descriptions/2013",
                    )
                tr["roman"] = paren
            elif at_start:
                kept_prefix += "({}) ".format(paren)
            else:
                kept_suffix = " ({})".format(paren) + kept_suffix
        elif kind == "taxonomic":
            if tr.get("taxonomic"):
                wxr.wtp.debug(
                    'more than one value in "taxonomic": {} vs. {}'.format(
                        tr["taxonomic"], paren
                    ),
                    sortid="form_descriptions/2019",
                )
            if re.match(r"×[A-Z]", paren):
                data_append(tr, "tags", "extinct")
                paren = paren[1:]
            tr["taxonomic"] = paren
        elif kind == "other":
            if tr.get("alt"):
                wxr.wtp.debug(
                    'more than one value in "alt": {} vs. {}'.format(
                        tr["alt"], paren
                    ),
                    sortid="form_descriptions/2028",
                )
            tr["alt"] = paren
        else:
            wxr.wtp.debug(
                "parse_translation_desc unimplemented cls {}: {}".format(
                    kind, paren
                ),
                sortid="form_descriptions/2033",
            )

    # Check for gender indications in suffix
    text, final_tags = parse_head_final_tags(wxr, lang, text)
    data_extend(tr, "tags", final_tags)

    # Restore those parts that we did not want to remove (they are often
    # optional words or words that are always used with the given
    # translation)
    text = kept_prefix + text + kept_suffix

    if note:
        tr["note"] = note.strip()
    if text and text not in ignored_translations:
        tr["word"] = text.strip()

    # Sometimes gender seems to be at the end of "roman" field, see e.g.
    # fire/English/Noun/Translations/Egyptian (for "oxidation reaction")
    roman = tr.get("roman")
    if roman:
        for ending, gender in ((" f", "feminine"), (" m", "masculine")):
            if roman.endswith(ending):
                data_append(tr, "tags", gender)
                tr["roman"] = roman[:-2].strip()
                break

    # If the word now has "translation" field but no "roman" field, and
    # the word would be classified "other" (generally non-latin
    # characters), and the value in "translation" is only one lowercase
    # word, move it to "roman".  This happens semi-frequently when the
    # translation is transliterated the same as some English word.
    gloss = tr.get("translation")
    if gloss and not tr.get("roman") and "word" in tr:
        if (
            classify_desc(tr["word"]) == "other"
            and " " not in gloss
            and gloss[0].islower()
        ):
            del tr["translation"]
            if "english" in tr:  # DEPRECATED for "translation"
                del tr["english"]
            tr["roman"] = gloss

    # If the entry now has both tr["roman"] and tr["word"] and they have
    # the same value, delete tr["roman"] (e.g., man/English/Translations
    # Evenki)
    word = tr.get("word")
    if word and tr.get("roman") == word:
        del tr["roman"]

3301 

3302 

def parse_alt_or_inflection_of(
    wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
    """Tries to parse an inflection-of or alt-of description.  If successful,
    this returns (tags, alt-of/inflection-of-dict).  If the description cannot
    be parsed, this returns None.  This may also return (tags, None) when the
    gloss describes a form (or some other tags were extracted from it), but
    there was no alt-of/form-of/synonym-of word.

    The gloss is first tried as-is and, if that fails and it starts with an
    uppercase character, once more with the first character lowercased.  A
    missing page title (``wxr.wtp.title`` is None) is treated as an empty
    title."""
    # print("parse_alt_or_inflection_of: {!r}".format(gloss))
    # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning.

    # The title is Optional; fall back to "" as the rest of this module does
    # (``wxr.wtp.title or ""``) instead of failing on ``None.lower()``.
    title_lower = (wxr.wtp.title or "").lower()
    gloss_lower = gloss.lower()

    # Never interpret a gloss that is equal to the word itself as a tag
    # (e.g., instrumental/Romanian, instrumental/Spanish).  Glosses of at
    # least five characters are also rejected when nearly equal to the title.
    if gloss_lower == title_lower or (
        len(gloss) >= 5 and distw([gloss_lower], title_lower) < 0.2
    ):
        return None

    # First try parsing it as-is
    parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
    if parsed is not None:
        return parsed

    # Next try parsing it with the first character converted to lowercase if
    # it was previously uppercase.
    if gloss and gloss[0].isupper():
        return parse_alt_or_inflection_of1(
            wxr, gloss[0].lower() + gloss[1:], gloss_template_args
        )

    return None

3335 

3336 

# Tags that are not allowed in alt-or-inflection-of parsing: a tagset
# containing any of these is not accepted as a form description.
alt_infl_disallowed: set[str] = {
    "error-unknown-tag",
    # Not in inflected forms and causes problems e.g. house/English
    "place",
}

3345 

3346 

def parse_alt_or_inflection_of1(
    wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
    """Helper function for parse_alt_or_inflection_of.  This handles a single
    capitalization.

    Returns None when ``gloss`` cannot be parsed as a form description,
    ``(tags, None)`` when tags were found but no base word remains, and
    otherwise ``(tags, entries)`` where each entry has a "word" key and,
    when extra text was found, an "extra" key."""
    if not gloss.strip():
        return None

    # Prevent some common errors where we would parse something we shouldn't
    if re.search(r"(?i)form of address ", gloss):
        return None

    gloss = re.sub(r"only used in [^,]+, ", "", gloss)

    def permitted(tagset) -> bool:
        # True when the tagset contains none of the disallowed tags.
        return not (alt_infl_disallowed & set(tagset))

    def drop_unlinked_periods(word: str) -> str:
        # Strip trailing periods as long as the word is neither an existing
        # page nor one of the gloss template arguments.
        while (
            word.endswith(".")
            and not wxr.wtp.page_exists(word)
            and word not in gloss_template_args
        ):
            word = word[:-1].strip()
        return word

    # First try all formats ending with "of" (or other known last words that
    # can end a form description), starting from the last occurrence.
    enders = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss))
    for ender in reversed(enders):
        desc = gloss[: ender.end()].strip()
        base = gloss[ender.end() :].strip()
        tagsets, topics = decode_tags(desc, no_unknown_starts=True)
        if not topics and any(permitted(ts) for ts in tagsets):
            # Successfully parsed, including "of" etc.
            tags: list[str] = []
            # If you have ("Western-Armenian", ..., "form-of") as your
            # tag set, it's most probable that it's something like
            # "Western Armenian form of խոսել (xosel)", which should
            # get "alt-of" instead of "form-of" (inflection).
            # խօսիլ/Armenian
            for tagset in tagsets:
                adjusted = set(tagset)
                if "form-of" in adjusted and any(
                    valid_tags.get(tag) == "dialect" for tag in tagset
                ):
                    adjusted = (adjusted - {"form-of"}) | {"alt-of"}
                if permitted(adjusted):
                    tags.extend(adjusted)
            if any(
                marker in tags
                for marker in ("alt-of", "form-of", "synonym-of", "compound-of")
            ):
                break
        if ender.group(1) == "of":
            # Try parsing without the final "of".  This is commonly used in
            # various form-of expressions.
            desc = gloss[: ender.start()]
            base = gloss[ender.end() :]
            tagsets, topics = decode_tags(desc, no_unknown_starts=True)
            # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}"
            #       .format(desc, base, tagsets, topics))
            if not topics and any(permitted(ts) for ts in tagsets):
                tags = []
                for tagset in tagsets:
                    if permitted(tagset):
                        tags.extend(tagset)
                # It must have at least one tag from form_of_tags
                if set(tags) & form_of_tags:
                    # Accept this as form-of
                    tags.append("form-of")
                    break
                if set(tags) & alt_of_tags:
                    # Accept this as alt-of
                    tags.append("alt-of")
                    break
    else:
        # Did not find a form description based on last word; see if the
        # whole description is tags
        tagsets, topics = decode_tags(gloss, no_unknown_starts=True)
        if topics:
            return None
        usable = [
            ts for ts in tagsets if permitted(ts) and form_of_tags & set(ts)
        ]
        if not usable:
            return None
        tags = []
        for tagset in usable:
            tags.extend(tagset)
        base = ""

    # kludge for Spanish (again): 'x of [word] combined with [clitic]'
    clitic = re.search(r"combined with \w+$", base)
    if clitic:
        tagsets, topics = decode_tags(clitic.group(0), no_unknown_starts=True)
        if not topics:
            for tagset in tagsets:
                tags.extend(tagset)
            base = base[: clitic.start()]

    # It is fairly common for form_of glosses to end with something like
    # "ablative case" or "in instructive case".  Parse that ending.
    base = base.strip()
    words = base.split()
    # print("parse_alt_or_inflection_of: lst={}".format(words))
    if len(words) >= 3 and words[-1] in ("case", "case."):
        node = valid_sequences.children.get(words[-2])
        if node and node.end:
            for seq in node.tags:
                tags.extend(seq.split(" "))
            # NOTE(review): the trimmed word list is not read again in this
            # function, so ``base`` still contains the case ending here.
            words = words[:-2]
            if words[-1] == "in" and len(words) > 1:
                words = words[:-1]

    # Eliminate empty and duplicate tags
    tags = sorted({tag for tag in tags if tag})

    # Clean up some extra stuff from the linked word, separating the text
    # into ``base`` (the linked word) and ``extra`` (additional information,
    # such as English translation or clarifying word sense information).
    orig_base = base
    base = re.sub(alt_of_form_of_clean_re, "", orig_base)
    base = re.sub(r" [(⟨][^()]*[)⟩]", "", base)  # Remove all (...) groups
    extra = orig_base[len(base) :]
    extra = re.sub(r"^[- :;.,,—]+", "", extra)
    if extra.endswith(".") and extra.count(".") == 1:
        extra = extra[:-1].strip()
    # Unwrap (...) or, failing that, the ⟨...⟩ brackets used in "slash mark"
    unwrapped = re.match(r"^\(([^()]*)\)$", extra) or re.match(
        r"^⟨([^()]*)⟩$", extra
    )
    if unwrapped:
        extra = unwrapped.group(1)
    quoted = re.match(r'^[“"]([^"“”]*)["”]$', extra)
    if quoted:
        extra = quoted.group(1)

    # Note: base might still contain comma-separated values and values
    # separated by "and"
    base = base.strip()
    if base.endswith(",") and len(base) > 2:
        base = base[:-1].strip()
    base = drop_unlinked_periods(base)
    if base.endswith('(\u201cconjecture")'):
        base = base[:-14].strip()
        tags.append("conjecture")
    base = drop_unlinked_periods(base)
    if (
        base.endswith(".")
        and base not in gloss_template_args
        and base[:-1] in gloss_template_args
    ):
        base = base[:-1]
    base = base.strip()
    if not base:
        return tags, None

    # Kludge: Spanish verb forms seem to have a dot added at the end.
    # Remove it; we know of no Spanish verbs ending with a dot.
    language = wxr.wtp.section
    pos = wxr.wtp.subsection
    # print("language={} pos={} base={}".format(language, pos, base))
    is_spanish_verb = language == "Spanish" and pos == "Verb"
    if (
        base.endswith(".")
        and len(base) > 1
        and base[-2].isalpha()
        and is_spanish_verb
    ):
        base = base[:-1]

    # Split base to alternatives when multiple alternatives provided
    parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "])
    titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "")
    if (
        len(parts) <= 1
        or base.startswith("/")
        or base.endswith("/")
        or "/" in titleword
    ):
        parts = [base]
    # Split base to alternatives when of form "a or b" and "a" and "b" are
    # similar (generally spelling variants of the same word or similar words)
    if len(parts) == 1:
        pieces = base.split()
        if (
            len(pieces) == 3
            and pieces[1] == "or"
            and distw([pieces[0]], pieces[2]) < 0.4
        ):
            parts = [pieces[0], pieces[2]]

    # Create form-of/alt-of entries based on the extracted data
    entries: list[AltOf] = []
    for part in parts:
        # Check for some suspicious base forms
        suspicious = re.search(r"[.,] |[{}()]", part)
        if suspicious and not wxr.wtp.page_exists(part):
            wxr.wtp.debug(
                "suspicious alt_of/form_of with {!r}: {}".format(
                    suspicious.group(0), part
                ),
                sortid="form_descriptions/2278",
            )
        if part.startswith("*") and len(part) >= 3 and part[1].isalpha():
            part = part[1:]
        entry: AltOf = {"word": part}
        if extra:
            entry["extra"] = extra
        entries.append(entry)
    # print("alt_or_infl_of returning tags={} lst={} base={!r}"
    #       .format(tags, entries, base))
    return tags, entries

3564 

3565 

3566@functools.lru_cache(maxsize=65536) 

3567def classify_desc( 

3568 desc: str, 

3569 allow_unknown_tags=False, 

3570 no_unknown_starts=False, 

3571 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(), 

3572) -> str: 

3573 """Determines whether the given description is most likely tags, english, 

3574 a romanization, or something else. Returns one of: "tags", "english", 

3575 "romanization", or "other". If ``allow_unknown_tags`` is True, then 

3576 allow "tags" classification even when the only tags are those starting 

3577 with a word in allowed_unknown_starts.""" 

3578 assert isinstance(desc, str) 

3579 # Empty and whitespace-only strings are treated as "other" 

3580 desc = desc.strip() 

3581 if not desc: 

3582 return "other" 

3583 

3584 normalized_desc = unicodedata.normalize("NFKD", desc) 

3585 

3586 # If it can be fully decoded as tags without errors, treat as tags 

3587 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts) 

3588 for tagset in tagsets: 

3589 assert isinstance(tagset, (list, tuple, set)) 

3590 if "error-unknown-tag" not in tagset and ( 

3591 topics or allow_unknown_tags or any(" " not in x for x in tagset) 

3592 ): 

3593 return "tags" 

3594 

3595 # Check if it looks like the taxonomic name of a species 

3596 if desc in known_species: 

3597 return "taxonomic" 

3598 desc1 = re.sub(r"^×([A-Z])", r"\1", desc) 

3599 desc1 = re.sub(r"\s*×.*", "", desc1) 

3600 lst = desc1.split() 

3601 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts: 

3602 have_non_english = 1 if lst[0].lower() not in english_words else 0 

3603 for x in lst[1:]: 

3604 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"): 

3605 continue 

3606 if x[0].isupper(): 

3607 break 

3608 if x not in english_words: 

3609 have_non_english += 1 

3610 else: 

3611 # Starts with known taxonomic term, does not contain uppercase 

3612 # words (except allowed letters) and at least one word is not 

3613 # English 

3614 if have_non_english >= len(lst) - 1 and have_non_english > 0: 3614 ↛ 3620line 3614 didn't jump to line 3620 because the condition on line 3614 was always true

3615 return "taxonomic" 

3616 

3617 # If all words are in our English dictionary, interpret as English. 

3618 # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde" 

3619 # in ASCII. Took me a while to figure out. 

3620 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1: 

3621 if desc in english_words and desc[0].isalpha(): 

3622 return "english" # Handles ones containing whitespace 

3623 desc1 = re.sub( 

3624 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc 

3625 ) 

3626 tokens = tokenizer.tokenize(desc1) 

3627 if not tokens: 3627 ↛ 3628line 3627 didn't jump to line 3628 because the condition on line 3627 was never true

3628 return "other" 

3629 lst_bool = list( 

3630 x not in not_english_words 

3631 and 

3632 # not x.isdigit() and 

3633 ( 

3634 x in english_words 

3635 or x.lower() in english_words 

3636 or x in known_firsts 

3637 or x[0].isdigit() 

3638 or x in accepted 

3639 or 

3640 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or 

3641 ( 

3642 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words 

3643 ) # Plural 

3644 or ( 

3645 x.endswith("ies") 

3646 and len(x) >= 5 

3647 and x[:-3] + "y" in english_words 

3648 ) # E.g. lily - lilies 

3649 or ( 

3650 x.endswith("ing") 

3651 and len(x) >= 5 

3652 and x[:-3] in english_words 

3653 ) # E.g. bring - bringing 

3654 or ( 

3655 x.endswith("ing") 

3656 and len(x) >= 5 

3657 and x[:-3] + "e" in english_words 

3658 ) # E.g., tone - toning 

3659 or ( 

3660 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words 

3661 ) # E.g. hang - hanged 

3662 or ( 

3663 x.endswith("ed") 

3664 and len(x) >= 5 

3665 and x[:-2] + "e" in english_words 

3666 ) # E.g. atone - atoned 

3667 or (x.endswith("'s") and x[:-2] in english_words) 

3668 or (x.endswith("s'") and x[:-2] in english_words) 

3669 or ( 

3670 x.endswith("ise") 

3671 and len(x) >= 5 

3672 and x[:-3] + "ize" in english_words 

3673 ) 

3674 or ( 

3675 x.endswith("ised") 

3676 and len(x) >= 6 

3677 and x[:-4] + "ized" in english_words 

3678 ) 

3679 or ( 

3680 x.endswith("ising") 

3681 and len(x) >= 7 

3682 and x[:-5] + "izing" in english_words 

3683 ) 

3684 or ( 

3685 re.search(r"[-/]", x) 

3686 and all( 

3687 ((y in english_words and len(y) > 2) or not y) 

3688 for y in re.split(r"[-/]", x) 

3689 ) 

3690 ) 

3691 ) 

3692 for x in tokens 

3693 ) 

3694 cnt = lst_bool.count(True) 

3695 rejected_words = tuple( 

3696 x for i, x in enumerate(tokens) if not lst_bool[i] 

3697 ) 

3698 if ( 

3699 any( 

3700 lst_bool[i] and x[0].isalpha() and len(x) > 1 

3701 for i, x in enumerate(tokens) 

3702 ) 

3703 and not desc.startswith("-") 

3704 and not desc.endswith("-") 

3705 and re.search(r"\w+", desc) 

3706 and ( 

3707 cnt == len(lst_bool) 

3708 or ( 

3709 any( 

3710 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens) 

3711 ) 

3712 and cnt >= len(lst_bool) - 1 

3713 ) 

3714 or cnt / len(lst_bool) >= 0.8 

3715 or ( 

3716 all(x in potentially_english_words for x in rejected_words) 

3717 and cnt / len(lst_bool) >= 0.50 

3718 ) 

3719 ) 

3720 ): 

3721 return "english" 

3722 # Some translations have apparent pronunciation descriptions in /.../ 

3723 # which we'll put in the romanization field (even though they probably are 

3724 # not exactly romanizations). 

3725 if desc.startswith("/") and desc.endswith("/"): 

3726 return "romanization" 

3727 # If all characters are in classes that could occur in romanizations, 

3728 # treat as romanization 

3729 classes = list( 

3730 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK" 

3731 for x in normalized_desc 

3732 ) 

3733 classes1 = [] 

3734 num_latin = 0 

3735 num_greek = 0 

3736 # part = "" 

3737 # for ch, cl in zip(normalized_desc, classes): 

3738 # part += f"{ch}({cl})" 

3739 # print(part) 

3740 for ch, cl in zip(normalized_desc, classes): 

3741 if ch in ( 

3742 "'", # ' in Arabic, / in IPA-like parenthesized forms 

3743 ".", # e.g., "..." in translations 

3744 ";", 

3745 ":", 

3746 "!", 

3747 "‘", 

3748 "’", 

3749 '"', 

3750 "“", 

3751 "”", 

3752 "/", 

3753 "?", 

3754 "…", # alternative to "..." 

3755 "⁉", # 見る/Japanese automatic transcriptions... 

3756 "?", 

3757 "!", 

3758 "⁻", # superscript -, used in some Cantonese roman, e.g. "we" 

3759 "ʔ", 

3760 "ʼ", 

3761 "ʾ", 

3762 "ʹ", 

3763 ): # ʹ e.g. in understand/English/verb Russian transl 

3764 classes1.append("OK") 

3765 continue 

3766 if cl not in ("Ll", "Lu"): 

3767 classes1.append(cl) 

3768 continue 

3769 try: 

3770 name = unicodedata.name(ch) 

3771 first = name.split()[0] 

3772 if first == "LATIN": 

3773 num_latin += 1 

3774 elif first == "GREEK": 

3775 num_greek += 1 

3776 elif first == "COMBINING": # Combining diacritic 3776 ↛ 3777line 3776 didn't jump to line 3777 because the condition on line 3776 was never true

3777 cl = "OK" 

3778 elif re.match(non_latin_scripts_re, name): 3778 ↛ 3782line 3778 didn't jump to line 3782 because the condition on line 3778 was always true

3779 cl = "NO" # Not acceptable in romanizations 

3780 except ValueError: 

3781 cl = "NO" # Not acceptable in romanizations 

3782 classes1.append(cl) 

3783 # print("classify_desc: {!r} classes1: {}".format(desc, classes1)) 

3784 # print(set(classes1) ) 

3785 if all( 

3786 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK") 

3787 for x in classes1 

3788 ): 

3789 if ( 

3790 (num_latin >= num_greek + 2 or num_greek == 0) 

3791 and classes1.count("OK") < len(classes1) 

3792 and classes1.count("Nd") < len(classes1) 

3793 ): 

3794 return "romanization" 

3795 # Otherwise it is something else, such as hanji version of the word 

3796 return "other" 

3797 

3798 

def remove_text_in_parentheses(text: str) -> str:
    """Return ``text`` with all parenthesized spans removed.

    Every "(" and ")" character is dropped, together with any characters
    that lie between a "(" and its matching ")".  Nested parentheses are
    handled by tracking the nesting depth.  Text following an unclosed
    "(" is removed up to the end of the string.

    An unmatched ")" is dropped but otherwise ignored: it does not push
    the nesting depth below zero, so the text that follows it is kept.
    """
    depth = 0
    kept: list[str] = []
    for c in text:
        if c == "(":
            depth += 1
        elif c == ")":
            # Never let the depth go negative.  Otherwise a stray ")"
            # would make later top-level text look nested (and be
            # dropped), while text inside a following "(...)" pair would
            # be seen at depth 0 and wrongly kept.
            if depth > 0:
                depth -= 1
        elif depth == 0:
            kept.append(c)
    return "".join(kept)