Coverage for src/wiktextract/extractor/en/form_descriptions.py: 78%

1324 statements  

coverage.py v7.13.0, created at 2025-12-09 05:43 +0000

1# Code for parsing linguistic form descriptions and tags for word senses 

2# (both the word entry head - initial part and parenthesized parts - 

3# and tags at the beginning of word senses) 

4# 

5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

6 

7import functools 

8import re 

9import unicodedata 

10from typing import ( 

11 Any, 

12 Literal, 

13 Optional, 

14 Sequence, 

15 Union, 

16) 

17 

18import Levenshtein 

19from nltk import TweetTokenizer # type:ignore[import-untyped] 

20 

21from ...datautils import data_append, data_extend, split_at_comma_semi 

22from ...tags import ( 

23 alt_of_tags, 

24 form_of_tags, 

25 head_final_bantu_langs, 

26 head_final_bantu_map, 

27 head_final_numeric_langs, 

28 head_final_other_langs, 

29 head_final_other_map, 

30 head_final_semitic_langs, 

31 head_final_semitic_map, 

32 uppercase_tags, 

33 valid_tags, 

34 xlat_descs_map, 

35 xlat_head_map, 

36 xlat_tags_map, 

37) 

38from ...topics import topic_generalize_map, valid_topics 

39from ...wxr_context import WiktextractContext 

40from .english_words import ( 

41 english_words, 

42 not_english_words, 

43 potentially_english_words, 

44) 

45from .form_descriptions_known_firsts import known_firsts 

46from .taxondata import known_species 

47from .type_utils import ( 

48 AltOf, 

49 FormData, 

50 LinkageData, 

51 SenseData, 

52 SoundData, 

53 TranslationData, 

54 WordData, 

55) 

56 

57# Tokenizer for classify_desc() 

58tokenizer = TweetTokenizer() 

59 

60# These are ignored as the value of a related form in form head. 

61IGNORED_RELATED: set[str] = set( 

62 [ 

63 "-", 

64 "־", 

65 "᠆", 

66 "‐", 

67 "‑", 

68 "‒", 

69 "–", 

70 "—", 

71 "―", 

72 "−", 

73 "⸺", 

74 "⸻", 

75 "﹘", 

76 "﹣", 

77 "-", 

78 "?", 

79 "(none)", 

80 ] 

81) 

82 

83 

84# First words of unicodedata.name() that indicate scripts that cannot be 

85# accepted in romanizations or english (i.e., should be considered "other" 

86# in classify_desc()). 

87non_latin_scripts: list[str] = [ 

88 "ADLAM", 

89 "ARABIC", 

90 "ARABIC-INDIC", 

91 "ARMENIAN", 

92 "BALINESE", 

93 "BENGALI", 

94 "BRAHMI", 

95 "BRAILLE", 

96 "CANADIAN", 

97 "CHAKMA", 

98 "CHAM", 

99 "CHEROKEE", 

100 "CJK", 

101 "COPTIC", 

102 "COUNTING ROD", 

103 "CUNEIFORM", 

104 "CYRILLIC", 

105 "DOUBLE-STRUCK", 

106 "EGYPTIAN", 

107 "ETHIOPIC", 

108 "EXTENDED ARABIC-INDIC", 

109 "GEORGIAN", 

110 "GLAGOLITIC", 

111 "GOTHIC", 

112 "GREEK", 

113 "GUJARATI", 

114 "GURMUKHI", 

115 "HANGUL", 

116 "HANIFI ROHINGYA", 

117 "HEBREW", 

118 "HIRAGANA", 

119 "JAVANESE", 

120 "KANNADA", 

121 "KATAKANA", 

122 "KAYAH LI", 

123 "KHMER", 

124 "KHUDAWADI", 

125 "LAO", 

126 "LEPCHA", 

127 "LIMBU", 

128 "MALAYALAM", 

129 "MEETEI", 

130 "MYANMAR", 

131 "NEW TAI LUE", 

132 "NKO", 

133 "OL CHIKI", 

134 "OLD PERSIAN", 

135 "OLD SOUTH ARABIAN", 

136 "ORIYA", 

137 "OSMANYA", 

138 "PHOENICIAN", 

139 "SAURASHTRA", 

140 "SHARADA", 

141 "SINHALA", 

142 "SUNDANESE", 

143 "SYLOTI", 

144 "TAI THAM", 

145 "TAKRI", 

146 "TAMIL", 

147 "TELUGU", 

148 "THAANA", 

149 "THAI", 

150 "TIBETAN", 

151 "TIFINAGH", 

152 "TIRHUTA", 

153 "UGARITIC", 

154 "WARANG CITI", 

155 "YI", 

156] 

157non_latin_scripts_re = re.compile( 

158 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b" 

159) 

160 

161# Sanity check xlat_head_map values 

162for k, v in xlat_head_map.items(): 

163 if v.startswith("?"): 

164 v = v[1:] 

165 for tag in v.split(): 

166 if tag not in valid_tags:

167 print( 

168 "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format( 

169 k, tag 

170 ) 

171 ) 

172 

173# Regexp for finding nested translations from translation items (these are 

174# used in, e.g., year/English/Translations/Arabic). This is actually used 

175# in page.py. 

176nested_translations_re = re.compile( 

177 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format( 

178 "|".join( 

179 re.escape(x.removeprefix("?")) 

180 for x in sorted(xlat_head_map.values(), key=len, reverse=True) 

181 if x and not x.startswith("class-") 

182 ) 

183 ) 

184) 

185 

186# Regexp that matches head tag specifiers. Used to match tags from end of 

187# translations and linkages 

188head_final_re_text = r"( -)?( ({}))+".format( 

189 "|".join( 

190 re.escape(x) 

191 for x in 

192 # The sort is to put longer ones first, preferring them in 

193 # the regexp match 

194 sorted(xlat_head_map.keys(), key=len, reverse=True) 

195 ) 

196) 

197head_final_re = re.compile(head_final_re_text + "$") 

198 

199# Regexp used to match head tag specifiers at end of a form for certain 

200# Bantu languages (particularly Swahili and similar languages). 

201head_final_bantu_re_text = r" ({})".format( 

202 "|".join(re.escape(x) for x in head_final_bantu_map.keys()) 

203) 

204head_final_bantu_re = re.compile(head_final_bantu_re_text + "$") 

205 

206# Regexp used to match head tag specifiers at end of a form for certain 

207# Semitic languages (particularly Arabic and similar languages). 

208head_final_semitic_re_text = r" ({})".format( 

209 "|".join(re.escape(x) for x in head_final_semitic_map.keys()) 

210) 

211head_final_semitic_re = re.compile(head_final_semitic_re_text + "$") 

212 

213# Regexp used to match head tag specifiers at end of a form for certain 

214# other languages (e.g., Lithuanian, Finnish, French). 

215head_final_other_re_text = r" ({})".format( 

216 "|".join(re.escape(x) for x in head_final_other_map.keys()) 

217) 

218head_final_other_re = re.compile(head_final_other_re_text + "$") 

219 

220# Regexp for splitting heads. See parse_word_head(). 

221head_split_re_text = ( 

222 "(" 

223 + head_final_re_text 

224 + "|" 

225 + head_final_bantu_re_text 

226 + "|" 

227 + head_final_semitic_re_text 

228 + "|" 

229 + head_final_other_re_text 

230 + ")?( or |[,;]+)" 

231) 

232head_split_re = re.compile(head_split_re_text) 

233head_split_re_parens = 0 

234for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text): 

235 head_split_re_parens += m.group(0).count("(") 

236 

237# Parenthesized parts that are ignored in translations 

238tr_ignored_parens: set[str] = set( 

239 [ 

240 "please verify", 

241 "(please verify)", 

242 "transliteration needed", 

243 "(transliteration needed)", 

244 "in words with back vowel harmony", 

245 "(in words with back vowel harmony)", 

246 "in words with front vowel harmony", 

247 "(in words with front vowel harmony)", 

248 "see below", 

249 "see usage notes below", 

250 ] 

251) 

252tr_ignored_parens_re = re.compile( 

253 r"^(" 

254 + "|".join(re.escape(x) for x in tr_ignored_parens) 

255 + ")$" 

256 + r"|^(Can we clean up|Can we verify|for other meanings see " 

257 r"lit\. )" 

258) 

259 

260# Translations that are ignored 

261ignored_translations: set[str] = set( 

262 [ 

263 "[script needed]", 

264 "please add this translation if you can", 

265 ] 

266) 

267 

268# Put english text into the "note" field in a translation if it contains one 

269# of these words 

270tr_note_re = re.compile( 

271 r"(\b(article|definite|indefinite|superlative|comparative|pattern|" 

272 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|" 

273 r"postposition|postp|action|actions|articles|" 

274 r"adverb|adverbs|noun|nouns|verb|verbs|before|" 

275 r"after|placed|prefix|suffix|used with|translated|" 

276 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|" 

277 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|" 

278 r"conjugation|declension|class|category|plural|singular|positive|" 

279 r"seldom used|formal|informal|familiar|unspoken|spoken|written|" 

280 r"indicative|progressive|conditional|potential|" 

281 r"accusative|adessive|inessive|superessive|elative|allative|" 

282 r"dialect|dialects|object|subject|predicate|movies|recommended|language|" 

283 r"locative|continuous|simple|continuousness|gerund|subjunctive|" 

284 r"periphrastically|no equivalent|not used|not always used|" 

285 r"used only with|not applicable|use the|signifying|wordplay|pronounced|" 

286 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|" 

287 r"may be replaced|stricter sense|for nonhumans|" 

288 r"sense:|used:|in full:|informally used|followed by|" 

289 r"not restricted to|pertaining to|or optionally with|are optional|" 

290 r"in conjunction with|in compounds|depending on the relationship|" 

291 r"person addressed|one person|multiple persons|may be replaced with|" 

292 r"optionally completed with|in the phrase|in response to|" 

293 r"before a|before an|preceded by|verbs ending|very common|after a verb|" 

294 r"with verb|with uncountable|with the objects|with stative|" 

295 r"can be replaced by|often after|used before|used after|" 

296 r"used in|clipping of|spoken|somewhat|capitalized|" 

297 r"short form|shortening of|shortened form|initialism of|" 

298 r"said to|rare:|rarer also|is rarer|negatively connoted|" 

299 r"previously mentioned|uncountable noun|countable noun|" 

300 r"countable nouns|uncountable nouns|" 

301 r"with predicative|with -|with imperfect|with a negated|" 

302 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|" 

303 r'"|' 

304 r"general term|after a vowel|before a vowel|" 

305 r"form|regular|irregular|alternative)" 

306 r")($|[) ])|^(" 

307 # Following are only matched at the beginning of the string 

308 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|" 

309 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|" 

310 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|" 

311 r"mainly|from|for|also|also:|acronym|" 

312 r"\+|with) " 

313) 

314# \b does not work at the end??? 

315 

316# Related forms matching this regexp will be considered suspicious if the 

317# page title does not also match one of these. 

318suspicious_related_re = re.compile( 

319 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)" 

320 r"|[][:=<>&#*|]" 

321 r"| \d+$" 

322) 

323 

324# Word forms (head forms, translations, etc) that will be considered ok and 

325# silently accepted even if they would otherwise trigger a suspicious 

326# form warning. 

327ok_suspicious_forms: set[str] = set( 

328 [ 

329 "but en or", # "golden goal"/English/Tr/French 

330 "cœur en or", # "heart of gold"/Eng/Tr/French 

331 "en or", # golden/Eng/Tr/French 

332 "men du", # jet/Etym2/Noun/Tr/Cornish 

333 "parachute en or", # "golden parachute"/Eng/Tr/French 

334 "vieil or", # "old gold"/Eng/Tr/French 

335 # "all that glitters is not gold"/Eng/Tr/French 

336 "tout ce qui brille n’est pas or", 

337 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek 

338 "period or full stop", 

339 ] 

340) 

341 

342 

343# Replacements to be done in classify_desc before tokenizing. This is a 

344# workaround for shortcomings in TweetTokenizer. 

345tokenizer_fixup_map = { 

346 r"a.m.": "AM", 

347 r"p.m.": "PM", 

348} 

349tokenizer_fixup_re = re.compile( 

350 r"\b(" 

351 + "|".join( 

352 re.escape(x) 

353 for x in sorted( 

354 tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True 

355 ) 

356 ) 

357 + r")" 

358) 

359 

360# Unknown tags starting with these words will be silently ignored. 

361ignored_unknown_starts: set[str] = set( 

362 [ 

363 "originally", 

364 "e.g.", 

365 "c.f.", 

366 "supplanted by", 

367 "supplied by", 

368 ] 

369) 

370 

371ignored_unknown_starts_re = re.compile( 

372 r"^(" 

373 + "|".join( 

374 re.escape(x) 

375 for x in sorted(ignored_unknown_starts, key=lambda x: -len(x)) 

376 ) 

377 + ") " 

378) 

379 

380# If an unknown sequence starts with one of these, it will continue as an 

381# unknown sequence until the end, unless it turns out to have a replacement. 

382allowed_unknown_starts: set[str] = set( 

383 [ 

384 "Relating", 

385 "accompanied", 

386 "added", 

387 "after", 

388 "answering", 

389 "as", 

390 "based", 

391 "before", 

392 "conjugated", 

393 "conjunction", 

394 "construed", 

395 "especially", 

396 "expression:", 

397 "figurative:", 

398 "followed", 

399 "for", 

400 "forms", 

401 "from", 

402 "governs", 

403 "in", 

404 "indicating", 

405 "modifying", 

406 "normally", 

407 "not", 

408 "of", 

409 "preceding", 

410 "prefixed", 

411 "referring", 

412 "relating", 

413 "revived", 

414 "said", 

415 "since", 

416 "takes", 

417 "used", 

418 "with", 

419 "With", 

420 "without", 

421 ] 

422) 

423# Allow the ignored unknown starts without complaining 

424allowed_unknown_starts.update(ignored_unknown_starts) 

425 

426# Full unknown tags that will be ignored in decode_tags() 

427# XXX this is unused, ask Tatu where the contents are now

428ignored_unknown_tags: set[str] = set([]) 

429 

430# Head endings that are mapped to tags 

431head_end_map = { 

432 " 1st conj.": "conjugation-1", 

433 " 2nd conj.": "conjugation-2", 

434 " 3rd conj.": "conjugation-3", 

435 " 4th conj.": "conjugation-4", 

436 " 5th conj.": "conjugation-5", 

437 " 6th conj.": "conjugation-6", 

438 " 7th conj.": "conjugation-7", 

439} 

440head_end_re = re.compile( 

441 r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$" 

442) 

443 

444 

445# Dictionary of language-specific parenthesized head part starts that 

446# either introduce new tags or modify previous tags. The value for each 

447# language is a dictionary that maps the first word of the head part to 

448# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous 

449# tags or a space-separated string of tags to remove, and ``add_tags`` should 

450# be a string of tags to add. 

451lang_specific_head_map: dict[ 

452 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]] 

453] = { 

454 "Danish": { 

455 # prefix: (rem_tags space separate string/True, add_tags s-sep str) 

456 "c": ("neuter", "common-gender"), 

457 "n": ("common-gender", "neuter"), 

458 "pl": ("singular neuter common-gender", "plural"), 

459 "sg": ("plural neuter common-gender", "singular"), 

460 }, 

461} 

462 

463 

464# Regular expression used to strip additional stuff from the end of alt_of and 

465# form_of. 

466alt_of_form_of_clean_re = re.compile( 

467 r"(?s)(" 

468 + "|".join( 

469 [ 

470 r":", 

471 r'[“"]', 

472 r";", 

473 r" \(", 

474 r" - ", 

475 r" ־ ", 

476 r" ᠆ ", 

477 r" ‐ ", 

478 r" ‑ ", 

479 r" ‒ ", 

480 r" – ", 

481 r" — ", 

482 r" ― ", 

483 r" − ", 

484 r" ⸺ ", 

485 r" ⸻ ", 

486 r" ﹘ ", 

487 r" ﹣ ", 

488 r" - ", 

489 r" \+ ", 

490 r" \(with ", 

491 r" with -ra/-re", 

492 r"\. Used ", 

493 r"\. Also ", 

494 r"\. Since ", 

495 r"\. A ", 

496 r"\.\. A ", 

497 r"\. An ", 

498 r"\.\. An ", 

499 r"\. an ", 

500 r"\. The ", 

501 r"\. Spanish ", 

502 r"\. Language ", 

503 r"\. former name of ", 

504 r"\. AIM", 

505 r"\. OT", 

506 r"\. Not ", 

507 r"\. Now ", 

508 r"\. Nowadays ", 

509 r"\. Early ", 

510 r"\. ASEAN", 

511 r"\. UN", 

512 r"\. IMF", 

513 r"\. WHO", 

514 r"\. WIPO", 

515 r"\. AC", 

516 r"\. DC", 

517 r"\. DNA", 

518 r"\. RNA", 

519 r"\. SOB", 

520 r"\. IMO", 

521 r"\. Behavior", 

522 r"\. Income ", 

523 r"\. More ", 

524 r"\. Most ", 

525 r"\. Only ", 

526 r"\. Also ", 

527 r"\. From ", 

528 r"\. Of ", 

529 r"\.\. Of ", 

530 r"\. To ", 

531 r"\. For ", 

532 r"\. If ", 

533 r"\. Praenominal ", 

534 r"\. This ", 

535 r"\. Replaced ", 

536 r"\. CHCS is the ", 

537 r"\. Equivalent ", 

538 r"\. Initialism ", 

539 r"\. Note ", 

540 r"\. Alternative ", 

541 r"\. Compare ", 

542 r"\. Cf\. ", 

543 r"\. Comparable ", 

544 r"\. Involves ", 

545 r"\. Sometimes ", 

546 r"\. Commonly ", 

547 r"\. Often ", 

548 r"\. Typically ", 

549 r"\. Possibly ", 

550 r"\. Although ", 

551 r"\. Rare ", 

552 r"\. Instead ", 

553 r"\. Integrated ", 

554 r"\. Distinguished ", 

555 r"\. Given ", 

556 r"\. Found ", 

557 r"\. Was ", 

558 r"\. In ", 

559 r"\. It ", 

560 r"\.\. It ", 

561 r"\. One ", 

562 r"\. Any ", 

563 r"\. They ", 

564 r"\. Members ", 

565 r"\. Each ", 

566 r"\. Original ", 

567 r"\. Especially ", 

568 r"\. Usually ", 

569 r"\. Known ", 

570 r"\.\. Known ", 

571 r"\. See ", 

572 r"\. see ", 

573 r"\. target was not ", 

574 r"\. Popular ", 

575 r"\. Pedantic ", 

576 r"\. Positive ", 

577 r"\. Society ", 

578 r"\. Plan ", 

579 r"\. Environmentally ", 

580 r"\. Affording ", 

581 r"\. Encompasses ", 

582 r"\. Expresses ", 

583 r"\. Indicates ", 

584 r"\. Text ", 

585 r"\. Large ", 

586 r"\. Sub-sorting ", 

587 r"\. Sax", 

588 r"\. First-person ", 

589 r"\. Second-person ", 

590 r"\. Third-person ", 

591 r"\. 1st ", 

592 r"\. 2nd ", 

593 r"\. 3rd ", 

594 r"\. Term ", 

595 r"\. Northeastern ", 

596 r"\. Northwestern ", 

597 r"\. Southeast ", 

598 r"\. Egyptian ", 

599 r"\. English ", 

600 r"\. Cape Province was split into ", 

601 r"\. Pañcat", 

602 r"\. of the ", 

603 r"\. is ", 

604 r"\. after ", 

605 r"\. or ", 

606 r"\. chromed", 

607 r"\. percussion", 

608 r"\. with his ", 

609 r"\. a\.k\.a\. ", 

610 r"\. comparative form ", 

611 r"\. singular ", 

612 r"\. plural ", 

613 r"\. present ", 

614 r"\. his ", 

615 r"\. her ", 

616 r"\. equivalent ", 

617 r"\. measuring ", 

618 r"\. used in ", 

619 r"\. cutely ", 

620 r"\. Protects", 

621 r'\. "', 

622 r"\.^", 

623 r"\. \+ ", 

624 r"\., ", 

625 r". — ", 

626 r", a ", 

627 r", an ", 

628 r", the ", 

629 r", obsolete ", 

630 r", possessed", # 'd/English 

631 r", imitating", # 1/English 

632 r", derived from", 

633 r", called ", 

634 r", especially ", 

635 r", slang for ", 

636 r" corresponding to ", 

637 r" equivalent to ", 

638 r" popularized by ", 

639 r" denoting ", 

640 r" in its various senses\.", 

641 r" used by ", 

642 r" but not for ", 

643 r" since ", 

644 r" i\.e\. ", 

645 r" i\. e\. ", 

646 r" e\.g\. ", 

647 r" eg\. ", 

648 r" etc\. ", 

649 r"\[http", 

650 r" — used as ", 

651 r" by K\. Forsyth ", 

652 r" by J\. R\. Allen ", 

653 r" by S\. Ferguson ", 

654 r" by G\. Donaldson ", 

655 r" May refer to ", 

656 r" An area or region ", 

657 ] 

658 ) 

659 + r").*$" 

660) 
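# Illustrative note (added; not in the original source): this is presumably
# applied with re.sub so that everything from the first matching separator
# onwards is dropped, e.g.
#   re.sub(alt_of_form_of_clean_re, "", "alternative form of foo. See bar")
#   -> "alternative form of foo"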

661 

662 

663class ValidNode: 

664 """Node in the valid_sequences tree. Each node is part of a chain 

665 or chains that form sequences built out of keys in key->tags 

666 maps like xlat_tags, etc. The ValidNode's 'word' is the key 

667 by which it is referred to in the root dict or a `children` dict,

668 `end` marks that the node is the end-terminus of a sequence (but 

669 it can still continue if the sequence is shared by the start of 

670 other sequences: "nominative$" and "nominative plural$" for example), 

671 `tags` and `topics` are the dicts containing tag and topic strings 

672 for terminal nodes (end==True).""" 

673 

674 __slots__ = ( 

675 "end", 

676 "tags", 

677 "topics", 

678 "children", 

679 ) 

680 

681 def __init__( 

682 self, 

683 end=False, 

684 tags: Optional[list[str]] = None, 

685 topics: Optional[list[str]] = None, 

686 children: Optional[dict[str, "ValidNode"]] = None, 

687 ) -> None: 

688 self.end = end 

689 self.tags: list[str] = tags or [] 

690 self.topics: list[str] = topics or [] 

691 self.children: dict[str, "ValidNode"] = children or {} 

692 

693 

694def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None: 

695 """Helper function for building trees of valid tags/sequences during 

696 initialization.""" 

697 assert isinstance(tree, ValidNode) 

698 assert isinstance(desc, str) 

699 assert v is None or isinstance(v, str) 

700 node = tree 

701 

702 # Build the tree structure: each node has children nodes 

703 # whose names are denoted by their dict key. 

704 for w in desc.split(" "): 

705 if w in node.children: 

706 node = node.children[w] 

707 else: 

708 new_node = ValidNode() 

709 node.children[w] = new_node 

710 node = new_node 

711 if not node.end: 

712 node.end = True 

713 if not v: 

714 return None # Terminate early because there are no tags 

715 

716 tagslist = [] 

717 topicslist = [] 

718 for vv in v.split(): 

719 if vv in valid_tags: 

720 tagslist.append(vv) 

721 elif vv in valid_topics:

722 topicslist.append(vv) 

723 else: 

724 print( 

725 "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv) 

726 ) 

727 topics = " ".join(topicslist) 

728 tags = " ".join(tagslist) 

729 # Changed to "_tags" and "_topics" to avoid possible key-collisions. 

730 if topics: 

731 node.topics.extend([topics]) 

732 if tags: 

733 node.tags.extend([tags]) 

734 
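# Rough illustration (added; not in the original source): after
#   add_to_valid_tree(tree, "nominative", "nominative")
#   add_to_valid_tree(tree, "nominative plural", "nominative plural")
# the tree contains tree.children["nominative"] (end=True, tags=["nominative"])
# with a child node "plural" (end=True, tags=["nominative plural"]).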

735 

736def add_to_valid_tree1( 

737 tree: ValidNode, 

738 k: str, 

739 v: Union[list[str], tuple[str, ...], str], 

740 valid_values: Union[set[str], dict[str, Any]], 

741) -> list[str]: 

742 assert isinstance(tree, ValidNode) 

743 assert isinstance(k, str) 

744 assert v is None or isinstance(v, (list, tuple, str)) 

745 assert isinstance(valid_values, (set, dict)) 

746 if not v:

747 add_to_valid_tree(valid_sequences, k, None) 

748 return [] 

749 elif isinstance(v, str): 

750 v = [v] 

751 q = [] 

752 for vv in v: 

753 assert isinstance(vv, str) 

754 add_to_valid_tree(valid_sequences, k, vv) 

755 vvs = vv.split() 

756 for x in vvs: 

757 q.append(x) 

758 # return each individual tag 

759 return q 

760 

761 

762def add_to_valid_tree_mapping( 

763 tree: ValidNode, 

764 mapping: Union[dict[str, Union[list[str], str]], dict[str, str]], 

765 valid_values: Union[set[str], dict[str, Any]], 

766 recurse: bool, 

767) -> None: 

768 assert isinstance(tree, ValidNode) 

769 assert isinstance(mapping, dict) 

770 assert isinstance(valid_values, (set, dict)) 

771 assert recurse in (True, False) 

772 for k, v in mapping.items(): 

773 assert isinstance(k, str) 

774 assert isinstance(v, (list, str)) 

775 if isinstance(v, str): 

776 q = add_to_valid_tree1(tree, k, [v], valid_values) 

777 else: 

778 q = add_to_valid_tree1(tree, k, v, valid_values) 

779 if recurse: 

780 visited = set() 

781 while q: 

782 v = q.pop() 

783 if v in visited: 

784 continue 

785 visited.add(v) 

786 if v not in mapping: 

787 continue 

788 vv = mapping[v] 

789 qq = add_to_valid_tree1(tree, k, vv, valid_values) 

790 q.extend(qq) 

791 

792 

793# Tree of sequences considered to be tags (includes sequences that are 

794# mapped to something that becomes one or more valid tags) 

795valid_sequences = ValidNode() 

796sequences_with_slashes: set[str] = set() 

797for tag in valid_tags: 

798 # The basic tags used in our tag system; some are a bit weird, but easier 

799 # to implement this with 'false' positives than to filter out stuff no one else

800 # uses. 

801 if "/" in tag: 

802 sequences_with_slashes.add(tag) 

803 add_to_valid_tree(valid_sequences, tag, tag) 

804for tag in uppercase_tags: 

805 hyphenated = re.sub(r"\s+", "-", tag) 

806 if "/" in tag: 

807 sequences_with_slashes.add(tag) 

808 add_to_valid_tree(valid_sequences, tag, hyphenated) 

809 

810# xlat_tags_map! 

811add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False) 

812for k in xlat_tags_map: 

813 if "/" in k: 

814 sequences_with_slashes.add(k) 

815# Add topics to the same table, with all generalized topics also added 

816for topic in valid_topics: 

817 assert " " not in topic 

818 if "/" in topic:

819 sequences_with_slashes.add(topic) 

820 add_to_valid_tree(valid_sequences, topic, topic) 

821# Let each original topic value stand alone. These are not generally on 

822# valid_topics. We add the original topics with spaces replaced by hyphens. 

823for topic in topic_generalize_map.keys(): 

824 hyphenated = re.sub(r"\s+", "-", topic) 

825 if "/" in topic:

826 sequences_with_slashes.add(topic) 

827 add_to_valid_tree(valid_sequences, topic, hyphenated) 

828# Add canonicalized/generalized topic values 

829add_to_valid_tree_mapping( 

830 valid_sequences, topic_generalize_map, valid_topics, True 

831) 

832 

833# Regex used to divide a decode candidate into parts that shouldn't 

834# have their slashes turned into spaces 

835slashes_re = re.compile( 

836 r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")" 

837) 

838 

839# Regexp used to find "words" from word heads and linguistic descriptions 

840word_pattern = ( 

841 r"[^ ,;()\u200e]+|" 

842 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|" 

843 r"[\u2800-\u28ff]|" # Braille characters 

844 r"\(([^()]|\([^()]*\))*\)" 

845) 

846 

847word_re_global = re.compile(word_pattern) 
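# Illustrative note (added; not in the original source): iterating
# word_re_global.finditer("foo (bar), baz") yields the tokens "foo", "(bar)"
# and "baz"; plain words, parenthesized groups and single Braille characters
# each count as one "word".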

848 

849 

850def distw(titleparts: Sequence[str], word: str) -> float: 

851 """Computes how distinct ``word`` is from the most similar word in 

852 ``titleparts``. Returns 1 if words completely distinct, 0 if 

853 identical, or otherwise something in between.""" 

854 assert isinstance(titleparts, (list, tuple)) 

855 assert isinstance(word, str) 

856 w = min( 

857 Levenshtein.distance(word, tw) / max(len(tw), len(word)) 

858 for tw in titleparts 

859 ) 

860 return w 

861 
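# For example (added; not in the original source):
#   distw(["cat"], "cat")  -> 0.0    (identical)
#   distw(["cat"], "cats") -> 0.25   (distance 1 / max length 4)
#   distw(["cat"], "dog")  -> 1.0    (completely distinct)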

862 

863def map_with( 

864 ht: dict[str, str | list[str]] | dict[str, str], 

865 lst: Sequence[str], 

866) -> list[str]: 

867 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or 

868 more alternatives each, and returns a combined list of alternatives.""" 

869 assert isinstance(ht, dict) 

870 assert isinstance(lst, (list, tuple)) 

871 ret = [] 

872 for x in lst: 

873 assert isinstance(x, str) 

874 x = x.strip() 

875 x = ht.get(x, x) 

876 if isinstance(x, str):

877 if x:

878 ret.append(x) 

879 elif isinstance(x, (list, tuple)): 

880 ret.extend(x) 

881 else: 

882 raise RuntimeError("map_with unexpected value: {!r}".format(x)) 

883 return ret 

884 
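# For example (added; not in the original source), with a hypothetical map:
#   map_with({"m": "masculine", "f": "feminine"}, ["m", "dual", "f"])
#   -> ["masculine", "dual", "feminine"]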

885 

886TagList = list[str] 

887PosPathStep = tuple[int, TagList, TagList] 

888 

889 

890def check_unknown( 

891 from_i: int, 

892 to_i: int, 

893 i: int, 

894 wordlst: Sequence[str], 

895 allow_any: bool, 

896 no_unknown_starts: bool, 

897) -> list[PosPathStep]: 

898 """Check if the current section from_i->to_i is actually unknown 

899 or if it needs some special handling. We already presupposed that 

900 this is UNKNOWN; this is just called to see what *kind* of UNKNOWN.""" 

901 assert isinstance(to_i, int) 

902 assert isinstance(from_i, int) 

903 assert isinstance(i, int) 

904 # Adds unknown tag if needed. Returns new last_i 

905 # print("check_unknown to_i={} from_i={} i={}" 

906 # .format(to_i, from_i, i)) 

907 if from_i >= to_i: 

908 return [] 

909 words = wordlst[from_i:to_i] 

910 tag = " ".join(words) 

911 assert tag 

912 # print(f"{tag=}") 

913 if re.match(ignored_unknown_starts_re, tag): 

914 # Tags with this start are to be ignored 

915 return [(from_i, ["UNKNOWN"], [])] 

916 if tag in ignored_unknown_tags:

917 return [] # One of the tags listed as to be ignored 

918 if tag in ("and", "or"): 

919 return [] 

920 if ( 

921 not allow_any 

922 and not words[0].startswith("~") 

923 and ( 

924 no_unknown_starts 

925 or words[0] not in allowed_unknown_starts 

926 or len(words) <= 1 

927 ) 

928 ): 

929 # print("ERR allow_any={} words={}" 

930 # .format(allow_any, words)) 

931 return [ 

932 (from_i, ["UNKNOWN"], ["error-unknown-tag"]) 

933 ] # Add ``tag`` here to include 

934 else: 

935 return [(from_i, ["UNKNOWN"], [tag])] 

936 
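# Rough behaviour sketch (added; not in the original source):
#   check_unknown(0, 2, 2, ["totally", "bogus"], False, False)
#     -> [(0, ["UNKNOWN"], ["error-unknown-tag"])]    (hard error)
#   check_unknown(0, 2, 2, ["used", "attributively"], False, False)
#     -> [(0, ["UNKNOWN"], ["used attributively"])]   ("used" is an allowed start)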

937 

938def add_new1( 

939 node: ValidNode, 

940 i: int, 

941 start_i: int, 

942 last_i: int, 

943 new_paths: list[list[PosPathStep]], 

944 new_nodes: list[tuple[ValidNode, int, int]], 

945 pos_paths: list[list[list[PosPathStep]]], 

946 wordlst: list[str], 

947 allow_any: bool, 

948 no_unknown_starts: bool, 

949 max_last_i: int, 

950) -> int: 

951 assert isinstance(new_paths, list) 

952 # print("add_new: start_i={} last_i={}".format(start_i, last_i)) 

953 # print("$ {} last_i={} start_i={}" 

954 # .format(w, last_i, start_i)) 

955 max_last_i = max(max_last_i, last_i) # if last_i has grown 

956 if (node, start_i, last_i) not in new_nodes: 

957 new_nodes.append((node, start_i, last_i)) 

958 if node.end: 

959 # We can see a terminal point in the search tree. 

960 u = check_unknown( 

961 last_i, start_i, i, wordlst, allow_any, no_unknown_starts 

962 ) 

963 # Create new paths candidates based on different past possible 

964 # paths; pos_path[last_i] contains possible paths, so add this 

965 # new one at the beginning(?) 

966 # The list comprehension inside the parens generates an iterable 

967 # of lists, so this is .extend( [(last_i...)], [(last_i...)], ... ) 

968 # XXX: this is becoming impossible to annotate, nodes might 

969 # need to become classed objects and not just dicts, or at least 

970 # a TypedDict with a "children" node 

971 new_paths.extend( 

972 [(last_i, node.tags, node.topics)] + u + x 

973 for x in pos_paths[last_i] 

974 ) 

975 max_last_i = i + 1 

976 return max_last_i 

977 

978 

979@functools.lru_cache(maxsize=65536) 

980def decode_tags( 

981 src: str, 

982 allow_any=False, 

983 no_unknown_starts=False, 

984) -> tuple[list[tuple[str, ...]], list[str]]: 

985 tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts) 

986 # print(f"decode_tags: {src=}, {tagsets=}") 

987 

988 # Insert retry-code here that modifies the text source 

989 if ( 

990 any(s.startswith("error-") for tagset in tagsets for s in tagset) 

991 # I hate Python's *nested* list comprehension syntax ^ 

992 or any(s.startswith("error-") for s in topics) 

993 ): 

994 new_tagsets: list[tuple[str, ...]] = [] 

995 new_topics: list[str] = [] 

996 

997 if "/" in src: 

998 # slashes_re contains valid key entries with slashes; we're going 

999 # to skip them by splitting the string and skipping handling every 

1000 # second entry, which contains the splitting group like "masculine/ 

1001 # feminine" style keys. 

1002 split_parts = re.split(slashes_re, src) 

1003 new_parts: list[str] = [] 

1004 if len(split_parts) > 1: 

1005 for i, s in enumerate(split_parts): 

1006 if i % 2 == 0: 

1007 new_parts.append(s.replace("/", " ")) 

1008 else: 

1009 new_parts.append(s) 

1010 new_src = "".join(new_parts) 

1011 else: 

1012 new_src = src 

1013 new_tagsets, new_topics = decode_tags1( 

1014 new_src, allow_any, no_unknown_starts 

1015 ) 

1016 elif " or " in src or " and " in src: 

1017 # Annoying kludge. 

1018 new_src = src.replace(" and ", " ") 

1019 new_src = new_src.replace(" or ", " ") 

1020 new_tagsets, new_topics = decode_tags1( 

1021 new_src, allow_any, no_unknown_starts 

1022 ) 

1023 # print(f"{new_tagsets=}") 

1024 

1025 if new_tagsets or new_topics: 

1026 old_errors = sum( 

1027 1 for tagset in tagsets for s in tagset if s.startswith("error") 

1028 ) 

1029 old_errors += sum(1 for s in topics if s.startswith("error")) 

1030 new_errors = sum( 

1031 1 

1032 for new_tagset in new_tagsets 

1033 for s in new_tagset 

1034 if s.startswith("error") 

1035 ) 

1036 new_errors += sum(1 for s in new_topics if s.startswith("error")) 

1037 

1038 if new_errors <= old_errors:

1039 return new_tagsets, new_topics 

1040 

1041 return tagsets, topics 

1042 
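# Rough usage illustration (added; not in the original source; exact output
# depends on the tag tables):
#   decode_tags("masculine plural") -> ([("masculine", "plural")], [])
# Unrecognized text instead yields a tagset containing "error-unknown-tag"
# (unless allow_any=True).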

1043 

1044def decode_tags1( 

1045 src: str, 

1046 allow_any=False, 

1047 no_unknown_starts=False, 

1048) -> tuple[list[tuple[str, ...]], list[str]]: 

1049 """Decodes tags, doing some canonicalizations. This returns a list of 

1050 lists of tags and a list of topics.""" 

1051 assert isinstance(src, str) 

1052 

1053 # print("decode_tags: src={!r}".format(src)) 

1054 

1055 pos_paths: list[list[list[PosPathStep]]] = [[[]]] 

1056 wordlst: list[str] = [] 

1057 max_last_i = 0 # pre-initialized here so that it can be used as a ref 

1058 

1059 add_new = functools.partial( 

1060 add_new1, # pre-set parameters and references for function 

1061 pos_paths=pos_paths, 

1062 wordlst=wordlst, 

1063 allow_any=allow_any, 

1064 no_unknown_starts=no_unknown_starts, 

1065 max_last_i=max_last_i, 

1066 ) 

1067 # First split the tags at commas and semicolons. Their significance is that 

1068 # a multi-word sequence cannot continue across them. 

1069 parts = split_at_comma_semi(src, extra=[";", ":"]) 

1070 

1071 for part in parts: 

1072 max_last_i = len(wordlst) # "how far have we gone?" 

1073 lst1 = part.split() 

1074 if not lst1: 

1075 continue 

1076 wordlst.extend(lst1) 

1077 cur_nodes: list[tuple[ValidNode, int, int]] = [] # Currently seen 

1078 for w in lst1: 

1079 i = len(pos_paths) - 1 

1080 new_nodes: list[tuple[ValidNode, int, int]] = [] 

1081 # replacement nodes for next loop 

1082 new_paths: list[list[PosPathStep]] = [] 

1083 # print("ITER i={} w={} max_last_i={} wordlst={}" 

1084 # .format(i, w, max_last_i, wordlst)) 

1085 node: ValidNode 

1086 start_i: int 

1087 last_i: int 

1088 for node, start_i, last_i in cur_nodes: 

1089 # ValidNodes are part of a search tree that checks if a 

1090 # phrase is found in xlat_tags_map and other text->tags dicts. 

1091 if w in node.children: 

1092 # the phrase continues down the tree 

1093 # print("INC", w) 

1094 max_last_i = add_new( 

1095 node.children[w], 

1096 i, 

1097 start_i, 

1098 last_i, 

1099 new_paths, 

1100 new_nodes, 

1101 ) 

1102 if node.end: 

1103 # we've hit an end point, the tags and topics have already 

1104 # been gathered at some point, don't do anything with the 

1105 # old stuff 

1106 if w in valid_sequences.children: 

1107 # This starts a *new* possible section 

1108 max_last_i = add_new( 

1109 valid_sequences.children[w], # root-> 

1110 i, 

1111 i, 

1112 i, 

1113 new_paths, 

1114 new_nodes, 

1115 ) 

1116 if w not in node.children and not node.end: 

1117 # print("w not in node and $: i={} last_i={} wordlst={}" 

1118 # .format(i, last_i, wordlst)) 

1119 # If i == last_i == 0, for example (beginning) 

1120 if ( 

1121 i == last_i 

1122 or no_unknown_starts 

1123 or wordlst[last_i] not in allowed_unknown_starts 

1124 ): 

1125 # print("NEW", w) 

1126 if w in valid_sequences.children: 

1127 # Start new sequences here 

1128 max_last_i = add_new( 

1129 valid_sequences.children[w], 

1130 i, 

1131 i, 

1132 last_i, 

1133 new_paths, 

1134 new_nodes, 

1135 ) 

1136 if not new_nodes: 

1137 # This is run at the start when i == max_last_i == 0, 

1138 # which is what populates the first node in new_nodes. 

1139 # Some initial words cause the rest to be interpreted as unknown 

1140 # print("not new nodes: i={} last_i={} wordlst={}" 

1141 # .format(i, max_last_i, wordlst)) 

1142 if ( 

1143 i == max_last_i 

1144 or no_unknown_starts 

1145 or wordlst[max_last_i] not in allowed_unknown_starts 

1146 ): 

1147 # print("RECOVER w={} i={} max_last_i={} wordlst={}" 

1148 # .format(w, i, max_last_i, wordlst)) 

1149 if w in valid_sequences.children: 

1150 max_last_i = add_new( 

1151 # new sequence from root 

1152 valid_sequences.children[w], 

1153 i, 

1154 i, 

1155 max_last_i, 

1156 new_paths, 

1157 new_nodes, 

1158 ) 

1159 cur_nodes = new_nodes # Completely replace nodes! 

1160 # 2023-08-18, fix to improve performance 

1161 # Decode tags does a big search of the best-shortest matching 

1162 # sequences of tags, but the original algorithm didn't have 

1163 # any culling happen during operation, so in a case with 

1164 # a lot of tags (for example, big blocks of text inserted 

1165 # somewhere by mistake that is processed by decode_tags), 

1166 # it would lead to exponential growth of new_paths contents. 

1167 # This culling, using the same weighting algorithm code as 

1168 # in the original is just applied to new_paths before it is 

1169 # added to pos_paths. Basically it's "take the 10 best paths". 

1170 # This *can* cause bugs if it gets stuck in a local minimum 

1171 # or something, but this whole process is one-dimensional 

1172 # and not that complex, so hopefully it works out... 

1173 pw = [] 

1174 path: list[PosPathStep] 

1175 for path in new_paths: 

1176 weight = len(path) 

1177 if any(x[1] == ["UNKNOWN"] for x in path): 

1178 weight += 100 # Penalize unknown paths 

1179 pw.append((weight, path)) 

1180 new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]] 

1181 pos_paths.append(new_paths) 

1182 

1183 # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}" 

1184 # .format(max_last_i, len(wordlst), len(pos_paths))) 

1185 

1186 if cur_nodes: 

1187 # print("END HAVE_NODES") 

1188 for node, start_i, last_i in cur_nodes: 

1189 if node.end: 

1190 # print("$ END start_i={} last_i={}" 

1191 # .format(start_i, last_i)) 

1192 for path in pos_paths[start_i]: 

1193 pos_paths[-1].append( 

1194 [(last_i, node.tags, node.topics)] + path 

1195 ) 

1196 else: 

1197 # print("UNK END start_i={} last_i={} wordlst={}" 

1198 # .format(start_i, last_i, wordlst)) 

1199 u = check_unknown( 

1200 last_i, 

1201 len(wordlst), 

1202 len(wordlst), 

1203 wordlst, 

1204 allow_any, 

1205 no_unknown_starts, 

1206 ) 

1207 if pos_paths[start_i]: 

1208 for path in pos_paths[start_i]: 

1209 pos_paths[-1].append(u + path) 

1210 else: 

1211 pos_paths[-1].append(u) 

1212 else: 

1213 # Check for a final unknown tag 

1214 # print("NO END NODES max_last_i={}".format(max_last_i)) 

1215 paths = pos_paths[max_last_i] or [[]] 

1216 u = check_unknown( 

1217 max_last_i, 

1218 len(wordlst), 

1219 len(wordlst), 

1220 wordlst, 

1221 allow_any, 

1222 no_unknown_starts, 

1223 ) 

1224 if u: 

1225 # print("end max_last_i={}".format(max_last_i)) 

1226 for path in list(paths): # Copy in case it is the last pos 

1227 pos_paths[-1].append(u + path) 

1228 

1229 # import json 

1230 # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True)) 

1231 

1232 if not pos_paths[-1]: 

1233 # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src)) 

1234 return [], [] 

1235 

1236 # Find the best path 

1237 pw = [] 

1238 for path in pos_paths[-1]: 

1239 weight = len(path) 

1240 if any(x[1] == ["UNKNOWN"] for x in path): 

1241 weight += 100 # Penalize unknown paths 

1242 pw.append((weight, path)) 

1243 path = min(pw)[1] 

1244 

1245 # Convert the best path to tagsets and topics 

1246 tagsets: list[list[str]] = [[]] 

1247 topics: list[str] = [] 

1248 for i, tagspec, topicspec in path: 

1249 if len(tagsets or "") > 16: 

1250 # ctx.error("Too many tagsets! This is probably exponential", 

1251 # sortid="form_descriptions/20230818") 

1252 return [("error-unknown-tag", "error-exponential-tagsets")], [] 

1253 if tagspec == ["UNKNOWN"]: 

1254 new_tagsets = [] 

1255 for x in tagsets: 

1256 new_tagsets.append(x + topicspec) 

1257 tagsets = new_tagsets 

1258 continue 

1259 if tagspec: 

1260 new_tagsets = [] 

1261 for x in tagsets: 

1262 for t in tagspec: 

1263 if t:

1264 new_tags = list(x) 

1265 for tag in t.split(): 

1266 if tag not in new_tags: 

1267 new_tags.append(tag) 

1268 new_tagsets.append(new_tags) 

1269 else: 

1270 new_tagsets.append(x) 

1271 tagsets = new_tagsets 

1272 if topicspec: 

1273 for t in topicspec: 

1274 for topic in t.split(): 

1275 if topic not in topics: 

1276 topics.append(topic) 

1277 

1278 # print("unsorted tagsets:", tagsets) 

1279 ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets)) 

1280 # topics = list(sorted(set(topics))) XXX tests expect not sorted 

1281 # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics)) 

1282 # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST 

1283 # of tags. Turning topics into a tuple breaks tests, turning the tuples 

1284 # inside tagsets into lists breaks tests, I'm leaving them mismatched 

1285 # for now. XXX 

1286 return ret_tagsets, topics 

1287 

1288 

1289def parse_head_final_tags( 

1290 wxr: WiktextractContext, lang: str, form: str 

1291) -> tuple[str, list[str]]: 

1292 """Parses tags that are allowed at the end of a form head from the end 

1293 of the form. This can also be used for parsing the final gender etc tags 

1294 from translations and linkages.""" 

1295 assert isinstance(wxr, WiktextractContext) 

1296 assert isinstance(lang, str) # Should be language that "form" is for 

1297 assert isinstance(form, str) 

1298 

1299 # print("parse_head_final_tags: lang={} form={!r}".format(lang, form)) 

1300 

1301 # Make sure there are no double spaces in the form as this code does not 

1302 # handle them otherwise. 

1303 form = re.sub(r"\s+", " ", form.strip()) 

1304 if not form: 

1305 return form, [] 

1306 

1307 origform = form 

1308 

1309 tags = [] 

1310 

1311 # If parsing for certain Bantu languages (e.g., Swahili), handle 

1312 # some extra head-final tags first 

1313 if lang in head_final_bantu_langs: 

1314 m = re.search(head_final_bantu_re, form) 

1315 if m is not None: 

1316 tagkeys = m.group(1) 

1317 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr]

1318 form = form[: m.start()] 

1319 v = head_final_bantu_map[tagkeys] 

1320 if v.startswith("?"):

1321 v = v[1:] 

1322 wxr.wtp.debug( 

1323 "suspicious suffix {!r} in language {}: {}".format( 

1324 tagkeys, lang, origform 

1325 ), 

1326 sortid="form_descriptions/1028", 

1327 ) 

1328 tags.extend(v.split()) 

1329 

1330 # If parsing for certain Semitic languages (e.g., Arabic), handle 

1331 # some extra head-final tags first 

1332 if lang in head_final_semitic_langs: 

1333 m = re.search(head_final_semitic_re, form) 

1334 if m is not None: 

1335 tagkeys = m.group(1) 

1336 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr]

1337 form = form[: m.start()] 

1338 v = head_final_semitic_map[tagkeys] 

1339 if v.startswith("?"):

1340 v = v[1:] 

1341 wxr.wtp.debug( 

1342 "suspicious suffix {!r} in language {}: {}".format( 

1343 tagkeys, lang, origform 

1344 ), 

1345 sortid="form_descriptions/1043", 

1346 ) 

1347 tags.extend(v.split()) 

1348 

1349 # If parsing for certain other languages (e.g., Lithuanian, 

1350 # French, Finnish), handle some extra head-final tags first 

1351 if lang in head_final_other_langs: 

1352 m = re.search(head_final_other_re, form) 

1353 if m is not None: 

1354 tagkeys = m.group(1) 

1355 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr]

1356 form = form[: m.start()] 

1357 tags.extend(head_final_other_map[tagkeys].split(" ")) 

1358 

1359 # Handle normal head-final tags 

1360 m = re.search(head_final_re, form) 

1361 if m is not None: 

1362 tagkeys = m.group(3) 

1363 # Only replace tags ending with numbers in languages that have 

1364 # head-final numeric tags (e.g., Bantu classes); also, don't replace 

1365 # tags if the main title ends with them (then presume they are part 

1366 # of the word) 

1367 # print("head_final_tags form={!r} tagkeys={!r} lang={}" 

1368 # .format(form, tagkeys, lang)) 

1369 tagkeys_contains_digit = re.search(r"\d", tagkeys) 

1370 if ( 

1371 (not tagkeys_contains_digit or lang in head_final_numeric_langs) 

1372 and not wxr.wtp.title.endswith(" " + tagkeys) # type:ignore[union-attr] 

1373 and 

1374 # XXX the above test does not capture when the whole word is a 

1375 # xlat_head_map key, so I added the below test to complement 

1376 # it; does this break anything? 

1377 not wxr.wtp.title == tagkeys 

1378 ): # defunct/English, 

1379 # "more defunct" -> "more" ["archaic"] 

1380 if not tagkeys_contains_digit or lang in head_final_numeric_langs:

1381 form = form[: m.start()] 

1382 v = xlat_head_map[tagkeys] 

1383 if v.startswith("?"):

1384 v = v[1:] 

1385 wxr.wtp.debug( 

1386 "suspicious suffix {!r} in language {}: {}".format( 

1387 tagkeys, lang, origform 

1388 ), 

1389 sortid="form_descriptions/1077", 

1390 ) 

1391 tags.extend(v.split()) 

1392 

1393 # Generate warnings about words ending in " or" after processing 

1394 if ( 

1395 (form.endswith(" or") and not origform.endswith(" or")) 

1396 or re.search( 

1397 r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|" 

1398 r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)" 

1399 r"($|/| (f|m|sg|pl|anim|inan))", 

1400 form, 

1401 ) 

1402 or form.endswith(" du") 

1403 ): 

1404 if form not in ok_suspicious_forms: 

1405 wxr.wtp.debug( 

1406 "suspicious unhandled suffix in {}: {!r}, originally {!r}".format( 

1407 lang, form, origform 

1408 ), 

1409 sortid="form_descriptions/1089", 

1410 ) 

1411 

1412 # print("parse_head_final_tags: form={!r} tags={}".format(form, tags)) 

1413 return form, tags 

1414 
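# Rough usage illustration (added; not in the original source; the exact tags
# depend on xlat_head_map):
#   parse_head_final_tags(wxr, "Spanish", "perro m")
#     -> ("perro", ["masculine"])    assuming "m" maps to "masculine"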

1415 

1416def quote_kept_parens(s: str) -> str: 

1417 """Changes certain parenthesized expressions so that they won't be 

1418 interpreted as parentheses. This is used for parts that are kept as 

1419 part of the word, such as "rear admiral (upper half)"."""

1420 return re.sub( 

1421 r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|" 

1422 r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)", 

1423 r"__lpar__\1__rpar__", 

1424 s, 

1425 ) 

1426 
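# For example (added; not in the original source):
#   quote_kept_parens("rear admiral (upper half)")
#     -> "rear admiral __lpar__upper half__rpar__"
# unquote_kept_parens() below reverses this.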

1427 

1428def quote_kept_ruby( 

1429 wxr: WiktextractContext, 

1430 ruby_tuples: list[ 

1431 tuple[ 

1432 str, 

1433 str, 

1434 ] 

1435 ], 

1436 s: str, 

1437) -> str: 

1438 if len(ruby_tuples) < 1:

1439 wxr.wtp.debug( 

1440 "quote_kept_ruby called with no ruby", 

1441 sortid="form_description/1114/20230517", 

1442 ) 

1443 return s 

1444 ks = [] 

1445 rs = [] 

1446 for k, r in ruby_tuples: 

1447 ks.append(re.escape(k)) 

1448 rs.append(re.escape(r)) 

1449 if not (ks and rs):

1450 wxr.wtp.debug( 

1451 f"empty column in ruby_tuples: {ruby_tuples}", 

1452 sortid="form_description/1124/20230606", 

1453 ) 

1454 return s 

1455 newm = re.compile( 

1456 r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs)) 

1457 ) 

1458 rub_re = re.compile( 

1459 r"({})".format( 

1460 r"|".join( 

1461 r"{}\(*{}\)*".format( 

1462 re.escape(k), 

1463 re.escape(r), 

1464 ) 

1465 for k, r in ruby_tuples 

1466 ) 

1467 ) 

1468 ) 

1469 

1470 def paren_replace(m: re.Match) -> str: 

1471 return re.sub(newm, r"\1__lrub__\2__rrub__", m.group(0)) 

1472 

1473 return re.sub(rub_re, paren_replace, s) 

1474 

1475 

1476def unquote_kept_parens(s: str) -> str: 

1477 """Converts the quoted parentheses back to normal parentheses."""

1478 return re.sub(r"__lpar__(.*?)__rpar__", r"(\1)", s) 

1479 

1480 

1481def add_romanization( 

1482 wxr: WiktextractContext, 

1483 data: WordData, 

1484 roman: str, 

1485 text: str, 

1486 is_reconstruction: bool, 

1487 head_group: Optional[int], 

1488 ruby: Sequence[tuple[str, str]], 

1489) -> None: 

1490 tags_lst = ["romanization"] 

1491 m = re.match(r"([^:]+):(.+)", roman) 

1492 # This function's purpose is to intercept broken romanizations, 

1493 # like "Yale: hēnpyeng" style tags. Most romanization styles 

1494 # are already present as tags, so we can use decode_tags to find 

1495 # them. 

1496 if m:

1497 tagsets, topics = decode_tags(m.group(1)) 

1498 if tagsets: 

1499 for tags in tagsets: 

1500 tags_lst.extend(tags) 

1501 roman = m.group(2) 

1502 add_related( 

1503 wxr, 

1504 data, 

1505 tags_lst, 

1506 [roman], 

1507 text, 

1508 True, 

1509 is_reconstruction, 

1510 head_group, 

1511 ruby, 

1512 ) 

1513 

1514 

1515def add_related( 

1516 wxr: WiktextractContext, 

1517 data: WordData, 

1518 tags_lst: Union[list[str], tuple[str, ...]], 

1519 related_list: list[str], 

1520 origtext: str, 

1521 add_all_canonicals: bool, 

1522 is_reconstruction: bool, 

1523 head_group: Optional[int], 

1524 ruby_data: Optional[Sequence[tuple[str, str]]] = None, 

1525) -> Optional[list[tuple[str, ...]]]: 

1526 """Internal helper function for some post-processing entries for related 

1527 forms (e.g., in word head). This returns a list of lists of tags to be

1528 added to following related forms or None (cf. walrus/English word head, 

1529 parenthesized part starting with "both").""" 

1530 assert isinstance(wxr, WiktextractContext) 

1531 assert isinstance(tags_lst, (list, tuple)) 

1532 for x in tags_lst: 

1533 assert isinstance(x, str) 

1534 assert isinstance(related_list, (list, tuple)) 

1535 assert isinstance(origtext, str) 

1536 assert add_all_canonicals in (True, False) 

1537 assert isinstance(ruby_data, (list, tuple)) or ruby_data is None 

1538 if ruby_data is None:

1539 ruby_data = [] 

1540 related = " ".join(related_list) 

1541 # print("add_related: tags_lst={} related={}".format(tags_lst, related)) 

1542 if related == "[please provide]":

1543 return None 

1544 if related in IGNORED_RELATED:

1545 return None 

1546 if is_reconstruction and related.startswith("*") and len(related) > 1: 

1547 related = related[1:] 

1548 

1549 # Get title word, with any reconstruction prefix removed 

1550 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title) # type:ignore[arg-type] 

1551 

1552 def check_related(related: str) -> None: 

1553 # Warn about some suspicious related forms 

1554 m = re.search(suspicious_related_re, related) 

1555 if (m and m.group(0) not in titleword) or ( 

1556 related in ("f", "m", "n", "c") and len(titleword) >= 3 

1557 ): 

1558 if "eumhun" in tags_lst:

1559 return 

1560 if "cangjie-input" in tags_lst:

1561 return 

1562 if "class" in tags_lst:

1563 return 

1564 if wxr.wtp.section == "Korean" and re.search(

1565 r"^\s*\w*>\w*\s*$", related 

1566 ): 

1567 # ignore Korean "i>ni" / "라>나" values 

1568 return 

1569 if (

1570 wxr.wtp.section == "Burmese" 

1571 and "romanization" in tags_lst 

1572 and re.search(r":", related) 

1573 ): 

1574 # ignore Burmese with ":", that is used in Burmese 

1575 # transliteration of "း", the high-tone visarga.

1576 return 

1577 wxr.wtp.debug( 

1578 "suspicious related form tags {}: {!r} in {!r}".format( 

1579 tags_lst, related, origtext 

1580 ), 

1581 sortid="form_descriptions/1147", 

1582 ) 

1583 

1584 following_tagsets = None # Tagsets to add to following related forms 

1585 roman = None 

1586 tagsets1: list[tuple[str, ...]] = [tuple()] 

1587 topics1: list[str] = [] 

1588 

1589 m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related) 

1590 if m: 

1591 paren = m.group(1) 

1592 related = related[m.end() :] 

1593 m = re.match(r"^(all|both) (.*)", paren) 

1594 if m:

1595 tagsets1, topics1 = decode_tags(m.group(2)) 

1596 following_tagsets = tagsets1 

1597 else: 

1598 tagsets1, topics1 = decode_tags(paren) 

1599 else: 

1600 m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related) 

1601 if m: 

1602 paren = m.group(1) 

1603 if paren.startswith("U+"):

1604 related = related[: m.start()] 

1605 else: 

1606 cls = classify_desc(paren) 

1607 if (

1608 cls in ("romanization", "english") 

1609 and classify_desc(related[: m.start()]) == "other" 

1610 ): 

1611 roman = paren 

1612 related = related[: m.start()] 

1613 else: 

1614 related = related[: m.start()] 

1615 tagsets1, topics1 = decode_tags(paren) 

1616 if related and related.startswith("{{"):

1617 wxr.wtp.debug( 

1618 "{{ in word head form - possible Wiktionary error: {!r}".format( 

1619 related 

1620 ), 

1621 sortid="form_descriptions/1177", 

1622 ) 

1623 return None # Likely Wiktionary coding error 

1624 related = unquote_kept_parens(related) 

1625 # Split related by "/" (e.g., grande/Spanish) superlative in head 

1626 # Do not split if / in word title, see π//Japanese 

1627 if len(related) > 5 and "/" not in wxr.wtp.title: # type:ignore[operator] 

1628 alts = split_at_comma_semi(related, separators=["/"]) 

1629 else: 

1630 alts = [related] 

1631 if ruby_data: 

1632 # prepare some regex stuff in advance 

1633 ks, rs = [], [] 

1634 for k, r in ruby_data: 

1635 ks.append(re.escape(k)) 

1636 rs.append(re.escape(r)) 

1637 splitter = r"((?:{})__lrub__(?:{})__rrub__)".format( 

1638 "|".join(ks), "|".join(rs) 

1639 ) 

1640 for related in alts: 

1641 ruby: list[tuple[str, str]] = [] 

1642 if ruby_data: 

1643 new_related = [] 

1644 rub_split = re.split(splitter, related) 

1645 for s in rub_split: 

1646 m = re.match(r"(.+)__lrub__(.+)__rrub__", s) 

1647 if m: 

1648 # add ruby with (\1, \2) 

1649 ruby.append((m.group(1), m.group(2))) 

1650 new_related.append(m.group(1)) 

1651 else: 

1652 new_related.append(s) 

1653 related = "".join(new_related) 

1654 tagsets2, topics2 = decode_tags(" ".join(tags_lst)) 

1655 for tags1 in tagsets1: 

1656 assert isinstance(tags1, (list, tuple)) 

1657 for tags2 in tagsets2: 

1658 assert isinstance(tags1, (list, tuple)) 

1659 dt: LinkageData = {"word": related} 

1660 if roman: 

1661 dt["roman"] = roman 

1662 if ruby: 

1663 dt["ruby"] = ruby 

1664 if "alt-of" in tags2:

1665 check_related(related) 

1666 data_extend(data, "tags", tags1) 

1667 data_extend(data, "tags", tags2) 

1668 data_extend(data, "topics", topics1) 

1669 data_extend(data, "topics", topics2) 

1670 data_append(data, "alt_of", dt) 

1671 elif "form-of" in tags2:

1672 check_related(related) 

1673 data_extend(data, "tags", tags1) 

1674 data_extend(data, "tags", tags2) 

1675 data_extend(data, "topics", topics1) 

1676 data_extend(data, "topics", topics2) 

1677 data_append(data, "form_of", dt) 

1678 elif "compound-of" in tags2:

1679 check_related(related) 

1680 data_extend(data, "tags", tags1) 

1681 data_extend(data, "tags", tags2) 

1682 data_extend(data, "topics", topics1) 

1683 data_extend(data, "topics", topics2) 

1684 data_append(data, "compound", related) 

1685 else: 

1686 lang = wxr.wtp.section or "LANG_MISSING" 

1687 related, final_tags = parse_head_final_tags( 

1688 wxr, lang, related 

1689 ) 

1690 # print("add_related: related={!r} tags1={!r} tags2={!r} " 

1691 # "final_tags={!r}" 

1692 # .format(related, tags1, tags2, final_tags)) 

1693 tags = list(tags1) + list(tags2) + list(final_tags) 

1694 check_related(related) 

1695 form: FormData = {"form": related} 

1696 if head_group: 

1697 form["head_nr"] = head_group 

1698 if roman: 

1699 form["roman"] = roman 

1700 if ruby: 

1701 form["ruby"] = ruby 

1702 data_extend(form, "topics", topics1) 

1703 data_extend(form, "topics", topics2) 

1704 if topics1 or topics2: 1704 ↛ 1705: line 1704 didn't jump to line 1705 because the condition on line 1704 was never true

1705 wxr.wtp.debug( 

1706 "word head form has topics: {}".format(form), 

1707 sortid="form_descriptions/1233", 

1708 ) 

1709 # Add tags from canonical form into the main entry 

1710 if "canonical" in tags: 

1711 if related in ("m", "f") and len(titleword) > 1: 1711 ↛ 1712: line 1711 didn't jump to line 1712 because the condition on line 1711 was never true

1712 wxr.wtp.debug( 

1713 "probably incorrect canonical form " 

1714 "{!r} ignored (probably tag combination " 

1715 "missing from xlat_head_map)".format(related), 

1716 sortid="form_descriptions/1241", 

1717 ) 

1718 continue 

1719 if ( 

1720 related != titleword 

1721 or add_all_canonicals 

1722 or topics1 

1723 or topics2 

1724 or ruby 

1725 ): 

1726 data_extend(form, "tags", sorted(set(tags))) 

1727 else: 

1728 # We won't add canonical form here 

1729 filtered_tags = list( 

1730 x for x in tags if x != "canonical" 

1731 ) 

1732 data_extend(data, "tags", filtered_tags) 

1733 continue 

1734 else: 

1735 data_extend(form, "tags", sorted(set(tags))) 

1736 # Only insert if the form is not already there 

1737 for old in data.get("forms", ()): 

1738 if form == old: 1738 ↛ 1739: line 1738 didn't jump to line 1739 because the condition on line 1738 was never true

1739 break 

1740 else: 

1741 data_append(data, "forms", form) 

1742 

1743 # If this form had pre-tags that started with "both" or "all", add those 

1744 # tags also to following related forms that don't have their own tags 

1745 # specified. 

1746 return following_tagsets 

1747 
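For orientation, a single entry appended to data["forms"] by the loop above has roughly this shape (hedged illustration only; the exact tag values depend on the decoded tag sets):

example_form = {           # illustrative FormData-style record, not actual output
    "form": "gatos",       # the related form extracted from the head
    "tags": ["plural"],    # tags1 + tags2 + head-final tags, sorted and deduplicated
    # optional keys seen above: "roman", "ruby", "head_nr", "topics"
}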

1748 

1749# Issue #967: in English, word forms are sometimes skipped because 

1750# they are taggable words and their distw() is too large, like "clipping" from "clip" 

1751WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = { 

1752 "clip": ["clipping"], # XXX remember to change me back to clipping after 

1753 "English": ["English", "Englishes"], 

1754 "common": ["common", "commoner"], 

1755 # tests. 

1756} 
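The overrides above exist because of edit-distance filtering. A rough stand-alone sketch of the kind of normalized Levenshtein check involved (distw() itself is defined earlier in this module and may compute the ratio differently) shows why "clipping" looks too far from "clip" to be accepted as one of its forms:

import Levenshtein

def rough_distance(a: str, b: str) -> float:
    # Assumed approximation of distw(): edit distance scaled by word length.
    return Levenshtein.distance(a, b) / max(len(a), len(b))

# 4 edits over 8 characters -> 0.5, above the ~0.4 threshold used in
# parse_word_head(), so "clipping" would be rejected without the override table.
print(rough_distance("clip", "clipping"))  # 0.5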

1757 

1758WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = { 

1759 "unaccountability": ["countable", "uncountable"], 

1760 "uncountability": ["countable", "uncountable"], 

1761} 

1762 

1763FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {} 

1764 

1765FORM_ASSOCIATED_TAG_WORDS: set[str] = { 

1766 "participle", 

1767 "past", 

1768 "present", 

1769 "singular", 

1770 "plural", 

1771 "first-person", 

1772 "second-person", 

1773 "third-person", 

1774 "gerund", 

1775} 

1776 

1777 

1778def parse_word_head( 

1779 wxr: WiktextractContext, 

1780 pos: str, 

1781 text: str, 

1782 data: WordData, 

1783 is_reconstruction: bool, 

1784 head_group: Optional[int], 

1785 ruby=None, 

1786 links=None, 

1787) -> None: 

1788 """Parses the head line for a word for in a particular language and 

1789 part-of-speech, extracting tags and related forms.""" 

1790 assert isinstance(wxr, WiktextractContext) 

1791 assert isinstance(pos, str) 

1792 assert isinstance(text, str) 

1793 assert isinstance(data, dict) 

1794 assert isinstance(ruby, (list, tuple)) or ruby is None 

1795 if ruby is None: 

1796 ruby = [] 

1797 assert is_reconstruction in (True, False) 

1798 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text)) 

1799 # print(f"PARSE_WORD_HEAD: {data=}") 

1800 if links is None: 

1801 links = [] 

1802 

1803 if len(links) > 0: 

1804 # if we have link data (that is, links with stuff like commas and 

1805 # spaces), replace word_re with a modified local scope pattern 

1806 # print(f"links {list((c, ord(c)) for link in links for c in link)=}") 

1807 word_re = re.compile( 

1808 r"\b" # In case we have forms that are longer and contain links 

1809 + 

1810 # or words as a substring... 

1811 r"\b|\b".join( 

1812 sorted((re.escape(s) for s in links), key=lambda x: -len(x)) 

1813 ) 

1814 + r"\b|" 

1815 + word_pattern 

1816 ) 

1817 else: 

1818 word_re = word_re_global 

1819 
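A small illustration, with hypothetical link strings, of why the alternation above is sorted longest-first: Python's re alternation takes the first branch that matches, so putting longer link texts first keeps multiword links from being truncated to a shorter prefix (the real pattern also appends word_pattern, omitted here):

import re

links_demo = ["rose", "rose-tinted glasses"]  # hypothetical link texts
pat = re.compile(
    r"\b"
    + r"\b|\b".join(
        sorted((re.escape(s) for s in links_demo), key=lambda x: -len(x))
    )
    + r"\b"
)
assert pat.match("rose-tinted glasses").group(0) == "rose-tinted glasses"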

1820 if "Lua execution error" in text or "Lua timeout error" in text: 1820 ↛ 1821line 1820 didn't jump to line 1821 because the condition on line 1820 was never true

1821 return 

1822 

1823 # Fix words with "superlative:" or "comparative:" at end of head 

1824 # e.g. grande/Spanish/Adj 

1825 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text) 

1826 

1827 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb 

1828 m = re.search(r", non-past ([^)]+ \([^)]+\))", text) 

1829 if m: 

1830 add_related( 

1831 wxr, 

1832 data, 

1833 ["non-past"], 

1834 [m.group(1)], 

1835 text, 

1836 True, 

1837 is_reconstruction, 

1838 head_group, 

1839 ruby, 

1840 ) 

1841 text = text[: m.start()] + text[m.end() :] 

1842 

1843 language = wxr.wtp.section 

1844 titleword = re.sub( 

1845 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE" 

1846 ) 

1847 titleparts = list( 

1848 m.group(0) 

1849 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE") 

1850 ) 

1851 if not titleparts: 1851 ↛ 1852: line 1851 didn't jump to line 1852 because the condition on line 1851 was never true

1852 return 

1853 

1854 # Remove " or" from the end to prevent weird canonical forms 

1855 if text.endswith(" or"): 

1856 for tp in titleparts: 

1857 if text.endswith(tp): 1857 ↛ 1858: line 1857 didn't jump to line 1858 because the condition on line 1857 was never true

1858 break 

1859 else: 

1860 text = text.removesuffix(" or").rstrip() 

1861 

1862 # Handle the part of the head that is not in parentheses. However, certain 

1863 # parenthesized parts are part of the word, and those must be handled 

1864 # specially here. 

1865 if ruby: 

1866 text = quote_kept_ruby(wxr, ruby, text) 

1867 base = text 

1868 base = quote_kept_parens(base) 

1869 base = remove_text_in_parentheses(base) 

1870 base = base.replace("?", "") # Removes uncertain articles etc 

1871 base = re.sub(r"\s+", " ", base) 

1872 base = re.sub(r" ([,;])", r"\1", base) 

1873 base = re.sub(r" • ", r" ", base) 

1874 # Many languages use • as a punctuation mark separating the base 

1875 # from the rest of the head. στάδιος/Ancient Greek, issue #176 

1876 base = base.strip() 

1877 # print(f"{base=}") 

1878 

1879 # Check for certain endings in head (mostly for compatibility with weird 

1880 # heads, e.g. rata/Romanian "1st conj." at end) 

1881 m = re.search(head_end_re, base) 

1882 tags: Union[tuple[str, ...], list[str]] = [] 

1883 if m: 1883 ↛ 1884: line 1883 didn't jump to line 1884 because the condition on line 1883 was never true

1884 tags = head_end_map[m.group(1).lower()].split() 

1885 data_extend(data, "tags", tags) 

1886 base = base[: m.start()] 

1887 

1888 # Special case: handle Hán Nôm readings for Vietnamese characters 

1889 m = re.match( 

1890 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base 

1891 ) 

1892 if m: 1892 ↛ 1893: line 1892 didn't jump to line 1893 because the condition on line 1892 was never true

1893 tag, readings = m.groups() 

1894 tag = re.sub(r"\s+", "-", tag) 

1895 for reading in split_at_comma_semi(readings, skipped=links): 

1896 add_related( 

1897 wxr, 

1898 data, 

1899 [tag], 

1900 [reading], 

1901 text, 

1902 True, 

1903 is_reconstruction, 

1904 head_group, 

1905 ruby, 

1906 ) 

1907 return 

1908 

1909 # Special case: Hebrew " [pattern: nnn]" ending 

1910 m = re.search(r"\s+\[pattern: ([^]]+)\]", base) 

1911 if m: 1911 ↛ 1912: line 1911 didn't jump to line 1912 because the condition on line 1911 was never true

1912 add_related( 

1913 wxr, 

1914 data, 

1915 ["class"], 

1916 [m.group(1)], 

1917 text, 

1918 True, 

1919 is_reconstruction, 

1920 head_group, 

1921 ruby, 

1922 ) 

1923 base = base[: m.start()] + base[m.end() :] 

1924 

1925 # Clean away some messy "Upload an image" template text used in 

1926 # American Sign Language: 

1927 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp 

1928 m = re.search(r"Upload .+ gif image.", base) 

1929 if m: 1929 ↛ 1930line 1929 didn't jump to line 1930 because the condition on line 1929 was never true

1930 base = base[: m.start()] + base[m.end() :] 

1931 

1932 # Split the head into alternatives. This is a complicated task, as 

1933 # we do not want to split on "or" or "," when immediately followed by more 

1934 # head-final tags, but otherwise do want to split by them. 

1935 # 20230907 added "or" to this to handle 'true or false', titles with 'or' 

1936 if wxr.wtp.title and ("," in wxr.wtp.title or " or " in wxr.wtp.title): 

1937 # A kludge to handle article titles/phrases with commas. 

1938 # Preprocess splits to first capture the title, then handle 

1939 # all the others as usual. 

1940 presplits = re.split(r"({})".format(wxr.wtp.title), base) 

1941 splits = [] 

1942 for psplit in presplits: 

1943 if psplit == wxr.wtp.title: 

1944 splits.append(psplit) 

1945 else: 

1946 splits.extend(re.split(head_split_re, psplit)) 

1947 else: 

1948 # Do the normal split; previous only-behavior. 

1949 splits = re.split(head_split_re, base) 

1950 # print("SPLITS:", splits) 

1951 alts: list[str] = [] 

1952 # print("parse_word_head: splits:", splits, 

1953 # "head_split_re_parens:", head_split_re_parens) 

1954 for i in range( 

1955 0, len(splits) - head_split_re_parens, head_split_re_parens + 1 

1956 ): 

1957 v = splits[i] 

1958 ending = splits[i + 1] or "" # XXX is this correct??? 

1959 # print("parse_word_head alts v={!r} ending={!r} alts={}" 

1960 # .format(v, ending, alts)) 

1961 if alts and (v == "" and ending): 

1962 assert ending[0] == " " 

1963 alts[-1] += " or" + ending # ending starts with a space 

1964 elif v or ending: 1964 ↛ 1954line 1964 didn't jump to line 1954 because the condition on line 1964 was always true

1965 alts.append((v or "") + (ending or "")) 

1966 last = splits[-1].strip() 

1967 conn = "" if len(splits) < 3 else splits[-2] 

1968 # print("parse_word_head alts last={!r} conn={!r} alts={}" 

1969 # .format(last, conn, alts)) 

1970 if ( 

1971 alts 

1972 and last 

1973 and ( 

1974 last.split()[0] in xlat_head_map 

1975 or ( 

1976 conn == " or " 

1977 and (alts[-1] + " or " + last).strip() in xlat_head_map 

1978 ) 

1979 ) 

1980 ): 

1981 alts[-1] += " or " + last 

1982 elif last: 

1983 alts.append(last) 

1984 

1985 # print("parse_word_head alts: {}".format(alts)) 

1986 # print(f"{base=}") 

1987 

1988 # Process the head alternatives 

1989 canonicals: list[tuple[list[str], list[str]]] = [] 

1990 mode: Optional[str] = None 

1991 for alt_i, alt in enumerate(alts): 

1992 alt = alt.strip() 

1993 if alt.startswith("compound form:"): 1993 ↛ 1994line 1993 didn't jump to line 1994 because the condition on line 1993 was never true

1994 mode = "compound-form" 

1995 alt = alt[14:].strip() 

1996 if ((dash_i := alt.find(" -")) > 0) and ( 

1997 dash_i > (wxr.wtp.title or "").find(" -") 

1998 ): 

1999 # test_en_head / test_suffixes_at_end_of_form1 

2000 # Some heads have suffixes that end up attached to the form 

2001 # like in https://en.wiktionary.org/wiki/%E6%A5%BD%E3%81%97%E3%81%84 

2002 alt = alt[:dash_i] 

2003 if mode == "compound-form": 2003 ↛ 2004line 2003 didn't jump to line 2004 because the condition on line 2003 was never true

2004 add_related( 

2005 wxr, 

2006 data, 

2007 ["in-compounds"], 

2008 [alt], 

2009 text, 

2010 True, 

2011 is_reconstruction, 

2012 head_group, 

2013 ruby, 

2014 ) 

2015 continue 

2016 # For non-first parts, see if it can be treated as tags-only 

2017 if alt_i == 0: 

2018 expanded_alts = [alt] 

2019 else: 

2020 expanded_alts = map_with(xlat_descs_map, [alt]) 

2021 # print("EXPANDED_ALTS:", expanded_alts) 

2022 tagsets: Optional[list[tuple[str, ...]]] 

2023 for alt in expanded_alts: 

2024 baseparts = list(m.group(0) for m in word_re.finditer(alt)) 

2025 if alt_i > 0: 

2026 tagsets, topics = decode_tags(" ".join(baseparts)) 

2027 if not any("error-unknown-tag" in x for x in tagsets): 

2028 data_extend(data, "topics", topics) 

2029 for tags1 in tagsets: 

2030 data_extend(data, "tags", tags1) 

2031 continue 

2032 

2033 alt, tags = parse_head_final_tags( 

2034 wxr, language or "MISSING_LANG", alt 

2035 ) 

2036 tags = list(tags) # Make sure we don't modify anything cached 

2037 tags.append("canonical") 

2038 if alt_i == 0 and "," in wxr.wtp.title: # type:ignore[operator] 

2039 # Kludge to handle article titles/phrases with commas. 

2040 # basepart's regex strips commas, which leads to a 

2041 # canonical form that is the title phrase without a comma. 

2042 # basepart in add_related is almost immediately joined with 

2043 # spaces anyhow. XXX not exactly sure why it's 

2044 # canonicals.append((tags, baseparts)) and not (tags, [alt]) 

2045 baseparts = [alt] 

2046 canonicals.append((tags, baseparts)) 

2047 for tags, baseparts in canonicals: 

2048 add_related( 

2049 wxr, 

2050 data, 

2051 tags, 

2052 baseparts, 

2053 text, 

2054 len(canonicals) > 1, 

2055 is_reconstruction, 

2056 head_group, 

2057 ruby, 

2058 ) 

2059 

2060 # Handle parenthesized descriptors for the word form and links to 

2061 # related words 

2062 text = quote_kept_parens(text) 

2063 parens = list( 

2064 m.group(2) 

2065 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text) 

2066 ) 

2067 parens.extend( 

2068 m.group(1) 

2069 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text) 

2070 ) 

2071 have_romanization = False 

2072 have_ruby = False 

2073 hiragana = "" 

2074 katakana = "" 

2075 for paren in parens: 

2076 paren = paren.strip() 

2077 if not paren: 2077 ↛ 2078line 2077 didn't jump to line 2078 because the condition on line 2077 was never true

2078 continue 

2079 if paren.startswith("see "): 

2080 continue 

2081 if paren.startswith("U+"): 2081 ↛ 2082line 2081 didn't jump to line 2082 because the condition on line 2081 was never true

2082 continue 

2083 # In some rare cases, strip a word that inflects from the form 

2084 # description, e.g. "look through rose-tinted glasses"/English. 

2085 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren) 

2086 

2087 # If it starts with hiragana or katakana, treat as such form. Note 

2088 # that each hiragana/katakana character is in separate parentheses, 

2089 # so we must concatenate them. 

2090 try: 

2091 un = unicodedata.name(paren[0]).split()[0] 

2092 except ValueError: 

2093 un = "INVALID" 

2094 if un == "KATAKANA": 2094 ↛ 2095line 2094 didn't jump to line 2095 because the condition on line 2094 was never true

2095 katakana += paren 

2096 have_ruby = True 

2097 continue 

2098 if un == "HIRAGANA": 2098 ↛ 2099line 2098 didn't jump to line 2099 because the condition on line 2098 was never true

2099 hiragana += paren 

2100 have_ruby = True 

2101 continue 

2102 

2103 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes," 

2104 # in the middle of the parenthesized expression, e.g. 薄 

2105 def strokes_repl(m: re.Match) -> str: 

2106 strokes1, tags1, strokes2, tags2 = m.groups() 

2107 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]: 

2108 tags = tags.split(", ") 

2109 tags = list( 

2110 "Mainland China" if t == "Mainland" else t for t in tags 

2111 ) 

2112 tags.append("strokes") 

2113 add_related( 

2114 wxr, 

2115 data, 

2116 tags, 

2117 [strokes], 

2118 text, 

2119 True, 

2120 is_reconstruction, 

2121 head_group, 

2122 ruby, 

2123 ) 

2124 return ", " 

2125 

2126 paren = re.sub( 

2127 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ", 

2128 strokes_repl, 

2129 paren, 

2130 ) 
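A quick, self-contained check of the stroke-count pattern used just above, on the example format quoted in the comment; the captured groups are the two stroke counts and their comma-separated tag lists:

import re

sample = ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes, "
m = re.search(r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ", sample)
assert m is not None
assert m.groups() == ("16", "Japan, Mainland", "17", "Hong Kong, Taiwan")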

2131 

2132 descriptors = map_with(xlat_descs_map, [paren]) 

2133 new_desc = [] 

2134 for desc in descriptors: 

2135 new_desc.extend( 

2136 map_with( 

2137 xlat_tags_map, 

2138 split_at_comma_semi(desc, extra=[", or "], skipped=links), 

2139 ) 

2140 ) 

2141 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None 

2142 following_tags = None # Added to prev_tags from previous parenthesized 

2143 # part, e.g. walrus/English 

2144 # "(both nonstandard, proscribed, uncommon)" 

2145 for desc_i, desc in enumerate(new_desc): 

2146 # print("HEAD DESC: {!r}".format(desc)) 

2147 

2148 # Abort on certain descriptors (assume remaining values are 

2149 # examples or uninteresting, cf. gaan/Navajo, horior/Latin) 

2150 if re.match(r"^(per |e\.g\.$)", desc): 2150 ↛ 2151line 2150 didn't jump to line 2151 because the condition on line 2150 was never true

2151 break 

2152 

2153 # If it all consists of CJK characters, add it with the 

2154 # CJK tag. This is used at least for some Vietnamese 

2155 # words (e.g., ba/Vietnamese) 

2156 try: 

2157 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2157 ↛ 2158line 2157 didn't jump to line 2158 because the condition on line 2157 was never true

2158 add_related( 

2159 wxr, 

2160 data, 

2161 ["CJK"], 

2162 [desc], 

2163 text, 

2164 True, 

2165 is_reconstruction, 

2166 head_group, 

2167 ruby, 

2168 ) 

2169 continue 

2170 except ValueError: 

2171 pass 

2172 

2173 # Handle some special cases 

2174 splitdesc = desc.split() 

2175 if ( 2175 ↛ 2184line 2175 didn't jump to line 2184 because the condition on line 2175 was never true

2176 len(splitdesc) >= 3 

2177 and splitdesc[1] == "superlative" 

2178 and classify_desc(splitdesc[0]) != "tags" 

2179 and prev_tags 

2180 ): 

2181 # Handle the special case of second comparative after comma, 

2182 # followed by superlative without comma. E.g. 

2183 # mal/Portuguese/Adv 

2184 for ts in prev_tags: 

2185 add_related( 

2186 wxr, 

2187 data, 

2188 ts, 

2189 [splitdesc[0]], 

2190 text, 

2191 True, 

2192 is_reconstruction, 

2193 head_group, 

2194 ruby, 

2195 ) 

2196 desc = " ".join(splitdesc[1:]) 

2197 elif ( 2197 ↛ 2205line 2197 didn't jump to line 2205 because the condition on line 2197 was never true

2198 len(splitdesc) == 2 

2199 and splitdesc[0] in ("also", "and") 

2200 and prev_tags 

2201 and classify_desc(splitdesc[1]) != "tags" 

2202 ): 

2203 # Sometimes alternative forms are prefixed with "also" or 

2204 # "and" 

2205 for ts in prev_tags: 

2206 add_related( 

2207 wxr, 

2208 data, 

2209 ts, 

2210 [splitdesc[1]], 

2211 text, 

2212 True, 

2213 is_reconstruction, 

2214 head_group, 

2215 ruby, 

2216 ) 

2217 continue 

2218 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2218 ↛ 2219line 2218 didn't jump to line 2219 because the condition on line 2218 was never true

2219 continue 

2220 

2221 # If only one word, assume it is comma-separated alternative 

2222 # to the previous one 

2223 if " " not in desc: 

2224 cls = classify_desc(desc) 

2225 if cls != "tags": 

2226 if prev_tags: 2226 ↛ 2228line 2226 didn't jump to line 2228 because the condition on line 2226 was never true

2227 # Assume comma-separated alternative to previous one 

2228 for ts in prev_tags: 

2229 add_related( 

2230 wxr, 

2231 data, 

2232 ts, 

2233 [desc], 

2234 text, 

2235 True, 

2236 is_reconstruction, 

2237 head_group, 

2238 ruby, 

2239 ) 

2240 continue 

2241 elif distw(titleparts, desc) <= 0.5: 2241 ↛ 2244line 2241 didn't jump to line 2244 because the condition on line 2241 was never true

2242 # Similar to head word, assume a dialectal variation to 

2243 # the base form. Cf. go/Alemannic German/Verb 

2244 add_related( 

2245 wxr, 

2246 data, 

2247 ["alternative"], 

2248 [desc], 

2249 text, 

2250 True, 

2251 is_reconstruction, 

2252 head_group, 

2253 ruby, 

2254 ) 

2255 continue 

2256 elif ( 

2257 cls in ("romanization", "english") 

2258 and not have_romanization 

2259 and classify_desc(titleword) == "other" 

2260 and not ( 

2261 "categories" in data and desc in data["categories"] 

2262 ) 

2263 ): 

2264 # Assume it to be a romanization 

2265 add_romanization( 

2266 wxr, 

2267 data, 

2268 desc, 

2269 text, 

2270 is_reconstruction, 

2271 head_group, 

2272 ruby, 

2273 ) 

2274 have_romanization = True 

2275 continue 

2276 

2277 m = re.match(r"^(\d+) strokes?$", desc) 

2278 if m: 

2279 # Special case, used to give #strokes for Han characters 

2280 add_related( 

2281 wxr, 

2282 data, 

2283 ["strokes"], 

2284 [m.group(1)], 

2285 text, 

2286 True, 

2287 is_reconstruction, 

2288 head_group, 

2289 ruby, 

2290 ) 

2291 continue 

2292 

2293 # See if it is radical+strokes 

2294 m = re.match( 

2295 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF" 

2296 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)" 

2297 r"( in (Japanese|Chinese|traditional Chinese|" 

2298 r"simplified Chinese))?$", 

2299 desc, 

2300 ) 

2301 if m: 2301 ↛ 2304line 2301 didn't jump to line 2304 because the condition on line 2301 was never true

2302 # Special case, used to give radical + strokes for Han 

2303 # characters 

2304 radical_strokes = m.group(1) 

2305 lang = m.group(3) 

2306 t = ["radical+strokes"] 

2307 if lang: 

2308 t.extend(lang.split()) 

2309 add_related( 

2310 wxr, 

2311 data, 

2312 t, 

2313 [radical_strokes], 

2314 text, 

2315 True, 

2316 is_reconstruction, 

2317 head_group, 

2318 ruby, 

2319 ) 

2320 prev_tags = None 

2321 following_tags = None 

2322 continue 

2323 

2324 # See if it indicates historical katakana orthography (←) or 

2325 # otherwise just a katakana/hiragana form 

2326 m = re.match(r"←\s*|kana\s+", desc) 

2327 if m: 2327 ↛ 2328line 2327 didn't jump to line 2328 because the condition on line 2327 was never true

2328 if desc.startswith("←"): 

2329 t1 = "historical " 

2330 else: 

2331 t1 = "" 

2332 x = desc[m.end() :] 

2333 if x.endswith("?"): 

2334 x = x[:-1] 

2335 # XXX should we add a tag indicating uncertainty? 

2336 if x: 

2337 name = unicodedata.name(x[0]) 

2338 if name.startswith("HIRAGANA "): 

2339 desc = t1 + "hiragana " + x 

2340 elif name.startswith("KATAKANA "): 

2341 desc = t1 + "katakana " + x 

2342 

2343 # See if it is "n strokes in Chinese" or similar 

2344 m = re.match( 

2345 r"(\d+) strokes in (Chinese|Japanese|" 

2346 r"traditional Chinese|simplified Chinese)$", 

2347 desc, 

2348 ) 

2349 if m: 2349 ↛ 2351line 2349 didn't jump to line 2351 because the condition on line 2349 was never true

2350 # Special case, used to give just strokes for some Han chars 

2351 strokes = m.group(1) 

2352 lang = m.group(2) 

2353 t = ["strokes"] 

2354 t.extend(lang.split()) 

2355 add_related( 

2356 wxr, 

2357 data, 

2358 t, 

2359 [strokes], 

2360 text, 

2361 True, 

2362 is_reconstruction, 

2363 head_group, 

2364 ruby, 

2365 ) 

2366 prev_tags = None 

2367 following_tags = None 

2368 continue 

2369 

2370 # American Sign Language has images (or requests for image) 

2371 # as heads, + this ASL gloss after. 

2372 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text) 

2373 if m2: 2373 ↛ 2374line 2373 didn't jump to line 2374 because the condition on line 2373 was never true

2374 add_related( 

2375 wxr, 

2376 data, 

2377 ["ASL-gloss"], 

2378 [m2.group(1)], 

2379 text, 

2380 True, 

2381 is_reconstruction, 

2382 head_group, 

2383 ruby, 

2384 ) 

2385 continue 

2386 

2387 parts = list(m.group(0) for m in re.finditer(word_re, desc)) 

2388 if not parts: 2388 ↛ 2389line 2388 didn't jump to line 2389 because the condition on line 2388 was never true

2389 prev_tags = None 

2390 following_tags = None 

2391 continue 

2392 

2393 # Check for certain language-specific header part starts that 

2394 # modify the tags applied to the following form 

2395 if len(parts) == 2 and language in lang_specific_head_map: 2395 ↛ 2396line 2395 didn't jump to line 2396 because the condition on line 2395 was never true

2396 ht = lang_specific_head_map[language] 

2397 if parts[0] in ht: 

2398 rem_tags, add_tags = ht[parts[0]] 

2399 new_prev_tags1: list[list[str]] = [] 

2400 tags2: Union[tuple[str, ...], list[str]] 

2401 for tags2 in prev_tags or [()]: 

2402 if rem_tags is True: # Remove all old tags 

2403 tsets = set() 

2404 else: 

2405 tsets = set(tags2) - set(rem_tags.split()) 

2406 tsets = tsets | set(add_tags.split()) 

2407 tags = list(sorted(tsets)) 

2408 add_related( 

2409 wxr, 

2410 data, 

2411 tags, 

2412 [parts[1]], 

2413 text, 

2414 True, 

2415 is_reconstruction, 

2416 head_group, 

2417 ruby, 

2418 ) 

2419 new_prev_tags1.append(tags) 

2420 prev_tags = new_prev_tags1 

2421 following_tags = None 

2422 continue 

2423 

2424 # Handle the special case of descriptors that are parenthesized, 

2425 # e.g., (archaic or Scotland) 

2426 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc) 

2427 if m is not None and classify_desc(m.group(1)) == "tags": 2427 ↛ 2428line 2427 didn't jump to line 2428 because the condition on line 2427 was never true

2428 tagpart = m.group(1) 

2429 related = [m.group(2)] 

2430 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True) 

2431 if topics: 

2432 wxr.wtp.debug( 

2433 "parenthized head part {!r} contains topics: {}".format( 

2434 tagpart, topics 

2435 ), 

2436 sortid="form_descriptions/1647", 

2437 ) 

2438 elif m is not None and re.match(r"in the sense ", m.group(1)): 2438 ↛ 2441line 2438 didn't jump to line 2441 because the condition on line 2438 was never true

2439 # Handle certain ignored cases 

2440 # e.g. bord/Danish: in the sense "plank" 

2441 related = [m.group(2)] 

2442 tagsets = [()] 

2443 else: 

2444 # Normal parsing of the descriptor 

2445 alt_related = None 

2446 alt_tagsets = None 

2447 tagsets = None 

2448 for i in range(len(parts), 0, -1): 

2449 related = parts[i:] 

2450 tagparts = parts[:i] 

2451 # print(" i={} related={} tagparts={}" 

2452 # .format(i, related, tagparts)) 

2453 tagsets, topics = decode_tags( 

2454 " ".join(tagparts), no_unknown_starts=True 

2455 ) 

2456 # print("tagparts={!r} tagsets={} topics={} related={} " 

2457 # "alt_related={} distw={:.2f}" 

2458 # .format(tagparts, tagsets, topics, related, 

2459 # alt_related, 

2460 # distw(titleparts, parts[i - 1]))) 

2461 if ( 

2462 topics 

2463 or not tagsets 

2464 or any("error-unknown-tag" in x for x in tagsets) 

2465 ): 

2466 if alt_related is not None: 2466 ↛ 2468line 2466 didn't jump to line 2468 because the condition on line 2466 was never true

2467 # We already had a good division, so let's stop. 

2468 break 

2469 # Bad division, try deeper 

2470 continue 

2471 # print(f"{parts[i-1]=}, {parts=}") 

2472 if ( 

2473 i > 1 

2474 and len(parts[i - 1]) >= 4 

2475 and ( 

2476 distw(titleparts, parts[i - 1]) <= 0.4 

2477 or ( 

2478 wxr.wtp.section == "English" 

2479 and wxr.wtp.title 

2480 in WORDS_WITH_FALSE_POSITIVE_TAGS 

2481 and parts[i - 1] 

2482 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title] 

2483 ) 

2484 ) 

2485 # Fixes 'unaccountability' wiktext #1196 

2486 and not ( 

2487 wxr.wtp.section == "English" 

2488 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS 

2489 and parts[i - 1] 

2490 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title] 

2491 ) 

2492 # Fixes wiktextract #983, where "participle" 

2493 # was too close to "Martinize" and so this accepted 

2494 # ["participle", "Martinize"] as matching; this 

2495 # kludge prevents this from happening if titleparts 

2496 # is shorter than what would be 'related'. 

2497 # This breaks if we want to detect stuff that 

2498 # actually gets an extra space-separated word when 

2499 # 'inflected'. 

2500 and ( 

2501 len(titleparts) >= len(parts[i - 1 :]) 

2502 or "or" in parts[i - 1 :] 

2503 ) 

2504 ): 

2505 # print(f"Reached; {parts=}, {parts[i-1]=}") 

2506 alt_related = related 

2507 alt_tagsets = tagsets 

2508 continue 

2509 alt_related = None 

2510 alt_tagsets = None 

2511 break 

2512 else: 

2513 if alt_related is None: 2513 ↛ 2545line 2513 didn't jump to line 2545 because the condition on line 2513 was always true

2514 # Check if the parenthesized part is likely a 

2515 # romanization 

2516 if ( 2516 ↛ 2524line 2516 didn't jump to line 2524 because the condition on line 2516 was never true

2517 (have_ruby or classify_desc(base) == "other") 

2518 and classify_desc(paren) == "romanization" 

2519 and not ( 

2520 "categories" in data 

2521 and desc in data["categories"] 

2522 ) 

2523 ): 

2524 for r in split_at_comma_semi( 

2525 paren, extra=[" or "], skipped=links 

2526 ): 

2527 add_romanization( 

2528 wxr, 

2529 data, 

2530 r, 

2531 text, 

2532 is_reconstruction, 

2533 head_group, 

2534 ruby, 

2535 ) 

2536 have_romanization = True 

2537 continue 

2538 tagsets = [("error-unrecognized-head-form",)] 

2539 wxr.wtp.debug( 

2540 "unrecognized head form: {}".format(desc), 

2541 sortid="form_descriptions/1698", 

2542 ) 

2543 continue 

2544 

2545 if alt_related is not None: 2545 ↛ 2546line 2545 didn't jump to line 2546 because the condition on line 2545 was never true

2546 related = alt_related 

2547 tagsets = alt_tagsets 

2548 

2549 # print("FORM END: tagsets={} related={}".format(tagsets, related)) 

2550 # print("==================") 

2551 

2552 if ( 2552 ↛ 2573line 2552 didn't jump to line 2573 because the condition on line 2552 was never true

2553 len(related) <= 0 

2554 and wxr.wtp.section == "English" 

2555 and tagsets is not None 

2556 and len(tagsets) > 0 

2557 and not any( 

2558 s.startswith("error-") for tagset in tagsets for s in tagset 

2559 ) 

2560 and any( 

2561 s in FORM_ASSOCIATED_TAG_WORDS 

2562 for tagset in tagsets 

2563 for s in tagset 

2564 ) 

2565 and ( 

2566 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS 

2567 and not any( 

2568 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""] 

2569 for rel in related 

2570 ) 

2571 ) 

2572 ): 

2573 wxr.wtp.debug( 

2574 f"Form tags without form: {desc=}, {tagsets=}", 

2575 sortid="form_description/20250107", 

2576 ) 

2577 if not tagsets: 2577 ↛ 2578line 2577 didn't jump to line 2578 because the condition on line 2577 was never true

2578 continue 

2579 

2580 # print(f"{alts=}, {related=}") 

2581 

2582 assert isinstance(related, (list, tuple)) 

2583 related_str = " ".join(related) 

2584 if "or" in titleparts: 

2585 alts = [related_str] 

2586 else: 

2587 alts = split_at_comma_semi( 

2588 related_str, separators=[r"\bor\b"], skipped=links 

2589 ) 

2590 # print(f"{related_str=}, {alts=}") 

2591 if not alts: 

2592 alts = [""] 

2593 for related_str in alts: 

2594 if related_str: 

2595 if prev_tags and ( 

2596 all( 

2597 all( 

2598 t in ["nonstandard", "dialectal"] 

2599 or valid_tags[t] == "dialect" 

2600 for t in tags 

2601 ) 

2602 for ts in tagsets 

2603 ) 

2604 or ( 

2605 any("participle" in ts for ts in prev_tags) 

2606 and all( 

2607 "attributive" in ts 

2608 or any(valid_tags[t] == "gender" for t in ts) 

2609 for ts in tagsets 

2610 ) 

2611 ) 

2612 ): 

2613 # Merged with previous tags. Don't update previous 

2614 # tags here; cf. burn/English/Verb 

2615 for tags_l in tagsets: 

2616 for ts in prev_tags: 

2617 tags_l1 = sorted(set(tags_l) | set(ts)) 

2618 add_related( 

2619 wxr, 

2620 data, 

2621 tags_l1, 

2622 [related_str], 

2623 text, 

2624 True, 

2625 is_reconstruction, 

2626 head_group, 

2627 ruby, 

2628 ) 

2629 else: 

2630 # Not merged with previous tags 

2631 for tags_l in tagsets: 

2632 if following_tags is not None: 2632 ↛ 2633line 2632 didn't jump to line 2633 because the condition on line 2632 was never true

2633 for ts in following_tags: 

2634 tags_l1 = list( 

2635 sorted(set(tags_l) | set(ts)) 

2636 ) 

2637 add_related( 

2638 wxr, 

2639 data, 

2640 tags_l1, 

2641 [related_str], 

2642 text, 

2643 True, 

2644 is_reconstruction, 

2645 head_group, 

2646 ruby, 

2647 ) 

2648 else: 

2649 ret = add_related( 

2650 wxr, 

2651 data, 

2652 tags_l, 

2653 [related_str], 

2654 text, 

2655 True, 

2656 is_reconstruction, 

2657 head_group, 

2658 ruby, 

2659 ) 

2660 if ret is not None: 2660 ↛ 2661line 2660 didn't jump to line 2661 because the condition on line 2660 was never true

2661 following_tags = ret 

2662 prev_tags = tagsets 

2663 else: 

2664 if desc_i < len(new_desc) - 1 and all( 2664 ↛ 2671line 2664 didn't jump to line 2671 because the condition on line 2664 was never true

2665 "participle" in ts or "infinitive" in ts 

2666 for ts in tagsets 

2667 ): 

2668 # Interpret it as a standalone form description 

2669 # in the middle, probably followed by forms or 

2670 # language-specific descriptors. cf. drikke/Danish 

2671 new_prev_tags2 = [] 

2672 for ts1 in prev_tags or [()]: 

2673 for ts2 in tagsets: 

2674 ts = tuple(sorted(set(ts1) | set(ts2))) 

2675 new_prev_tags2.append(ts) 

2676 prev_tags = new_prev_tags2 

2677 continue 

2678 for tags in tagsets: 

2679 data_extend(data, "tags", tags) 

2680 prev_tags = tagsets 

2681 following_tags = None 

2682 

2683 # Finally, if we collected hiragana/katakana, add them now 

2684 if hiragana: 2684 ↛ 2685line 2684 didn't jump to line 2685 because the condition on line 2684 was never true

2685 add_related( 

2686 wxr, 

2687 data, 

2688 ["hiragana"], 

2689 [hiragana], 

2690 text, 

2691 True, 

2692 is_reconstruction, 

2693 head_group, 

2694 ruby, 

2695 ) 

2696 if katakana: 2696 ↛ 2697line 2696 didn't jump to line 2697 because the condition on line 2696 was never true

2697 add_related( 

2698 wxr, 

2699 data, 

2700 ["katakana"], 

2701 [katakana], 

2702 text, 

2703 True, 

2704 is_reconstruction, 

2705 head_group, 

2706 ruby, 

2707 ) 

2708 

2709 # XXX check if this is actually relevant, tags in word root data 

2710 # are extremely rare (not sure where they slip through). 

2711 tags = data.get("tags", []) # type:ignore 

2712 if len(tags) > 0: 

2713 # wxr.wtp.debug( 

2714 # f"Tags appear in word root data: {data['tags']=}", # type:ignore 

2715 # sortid="form_descriptions/2620/20240606", 

2716 # ) # Messes up tests. 

2717 data["tags"] = sorted(set(tags)) # type:ignore 

2718 
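An illustrative, non-authoritative example of what parse_word_head() is aiming for: for a Spanish head line such as "gato m (plural gatos)" on the page "gato", the canonical form equals the title, so its gender tag is promoted to the entry while the parenthesized part becomes a form record, roughly:

expected_shape = {   # hedged illustration of the resulting WordData fields
    "tags": ["masculine"],
    "forms": [
        {"form": "gatos", "tags": ["plural"]},
    ],
}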

2719 

2720def parse_sense_qualifier( 

2721 wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData] 

2722) -> None: 

2723 """Parses tags or topics for a sense or some other data. The values are 

2724 added into the dictionary ``data``.""" 

2725 assert isinstance(wxr, WiktextractContext) 

2726 assert isinstance(text, str) 

2727 assert isinstance(data, dict) 

2728 # print("parse_sense_qualifier:", text) 

2729 if re.match(r"\([^()]+\)$", text): 2729 ↛ 2730line 2729 didn't jump to line 2730 because the condition on line 2729 was never true

2730 text = text[1:-1] 

2731 if re.match(r'"[^"]+"$', text): 2731 ↛ 2732line 2731 didn't jump to line 2732 because the condition on line 2731 was never true

2732 text = text[1:-1] 

2733 lst = map_with(xlat_descs_map, [text]) 

2734 sense_tags: list[str] = [] 

2735 for text in lst: 

2736 for semi in split_at_comma_semi(text): 

2737 if not semi: 2737 ↛ 2738line 2737 didn't jump to line 2738 because the condition on line 2737 was never true

2738 continue 

2739 orig_semi = semi 

2740 idx = semi.find(":") 

2741 if idx >= 0: 2741 ↛ 2742line 2741 didn't jump to line 2742 because the condition on line 2741 was never true

2742 semi = semi[:idx] 

2743 cls = classify_desc(semi, allow_unknown_tags=True) 

2744 # print("parse_sense_qualifier: classify_desc: {} -> {}" 

2745 # .format(semi, cls)) 

2746 if cls == "tags": 

2747 tagsets, topics = decode_tags(semi) 

2748 data_extend(data, "topics", topics) 

2749 # XXX should think how to handle distinct options better, 

2750 # e.g., "singular and plural genitive"; that can't really be 

2751 # done without changing the calling convention of this function. 

2752 # Should split sense if more than one category of tags differs. 

2753 for tags in tagsets: 

2754 sense_tags.extend(tags) 

2755 elif cls == "taxonomic": 2755 ↛ 2756line 2755 didn't jump to line 2756 because the condition on line 2755 was never true

2756 if re.match(r"×[A-Z]", semi): 

2757 sense_tags.append("extinct") 

2758 semi = semi[1:] 

2759 data["taxonomic"] = semi 

2760 elif cls == "english": 

2761 if "qualifier" in data and data["qualifier"] != orig_semi: 2761 ↛ 2762line 2761 didn't jump to line 2762 because the condition on line 2761 was never true

2762 data["qualifier"] += "; " + orig_semi 

2763 else: 

2764 data["qualifier"] = orig_semi 

2765 else: 

2766 wxr.wtp.debug( 

2767 "unrecognized sense qualifier: {}".format(text), 

2768 sortid="form_descriptions/1831", 

2769 ) 

2770 sense_tags = sorted(set(sense_tags)) 

2771 data_extend(data, "tags", sense_tags) 

2772 
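A tiny runnable check of the unwrapping step at the top of parse_sense_qualifier(): a fully quoted or fully parenthesized qualifier is stripped of its delimiters before classification (the qualifier string here is hypothetical):

import re

text = '"plank"'                      # hypothetical sense qualifier
if re.match(r"\([^()]+\)$", text):    # same unwrapping as in the function above
    text = text[1:-1]
if re.match(r'"[^"]+"$', text):
    text = text[1:-1]
assert text == "plank"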

2773 

2774def parse_pronunciation_tags( 

2775 wxr: WiktextractContext, text: str, data: SoundData 

2776) -> None: 

2777 assert isinstance(wxr, WiktextractContext) 

2778 assert isinstance(text, str) 

2779 assert isinstance(data, dict) 

2780 text = text.strip() 

2781 if not text: 2781 ↛ 2782line 2781 didn't jump to line 2782 because the condition on line 2781 was never true

2782 return 

2783 cls = classify_desc(text) 

2784 notes = [] 

2785 if cls == "tags": 

2786 tagsets, topics = decode_tags(text) 

2787 data_extend(data, "topics", topics) 

2788 for tagset in tagsets: 

2789 for t in tagset: 

2790 if " " in t: 2790 ↛ 2791line 2790 didn't jump to line 2791 because the condition on line 2790 was never true

2791 notes.append(t) 

2792 else: 

2793 data_append(data, "tags", t) 

2794 else: 

2795 notes.append(text) 

2796 if notes: 

2797 data["note"] = "; ".join(notes) 

2798 
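A condensed sketch of the tags-versus-note split implemented above, using a hypothetical list of decoded tags (the real function obtains these from decode_tags()):

sound: dict = {"tags": []}
notes = []
for t in ["UK", "colloquial", "weak vowel merger"]:   # hypothetical decoded tags
    # Same rule as above: anything containing a space is demoted to a note.
    (notes if " " in t else sound["tags"]).append(t)
if notes:
    sound["note"] = "; ".join(notes)
assert sound == {"tags": ["UK", "colloquial"], "note": "weak vowel merger"}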

2799 

2800def parse_translation_desc( 

2801 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData 

2802) -> None: 

2803 assert isinstance(wxr, WiktextractContext) 

2804 assert isinstance(lang, str) # The language of ``text`` 

2805 assert isinstance(text, str) 

2806 assert isinstance(tr, dict) 

2807 # print("parse_translation_desc:", text) 

2808 

2809 # Process all parenthesized parts from the translation item 

2810 note = None 

2811 restore_beginning = "" 

2812 restore_end = "" 

2813 while True: 

2814 beginning = False 

2815 # See if we can find a parenthesized expression at the end 

2816 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text) 

2817 if m: 

2818 par = m.group(1) 

2819 text = text[: m.start()] 

2820 if par.startswith(("literally ", "lit.")): 

2821 continue # Not useful for disambiguation in many idioms 

2822 else: 

2823 # See if we can find a parenthesized expression at the start 

2824 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text) 

2825 if m: 

2826 par = m.group(1) 

2827 text = text[m.end() :] 

2828 beginning = True 

2829 if re.match(r"^(\d|\s|,| or | and )+$", par): 2829 ↛ 2834line 2829 didn't jump to line 2834 because the condition on line 2829 was never true

2830 # Looks like this beginning parenthesized expression only 

2831 # contains digits or their combinations. We assume such 

2832 # to be sense descriptions if no sense has been selected, 

2833 # or otherwise just ignore them. 

2834 if not tr.get("sense"): 

2835 tr["sense"] = par 

2836 continue 

2837 else: 

2838 # See if we can find a parenthesized expression in the middle. 

2839 # Romanizations are sometimes between word and gender marker, 

2840 # e.g. wife/English/Tr/Yiddish. 

2841 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text) 

2842 if m: 

2843 par = m.group(1) 

2844 text = text[: m.start()] + text[m.end() :] 

2845 else: 

2846 # No more parenthesized expressions - break out of the loop 

2847 break 

2848 

2849 # Some cleanup of artifacts that may result from skipping some templates 

2850 # in earlier stages 

2851 if par.startswith(": "): 2851 ↛ 2852line 2851 didn't jump to line 2852 because the condition on line 2851 was never true

2852 par = par[2:] 

2853 if par.endswith(","): 2853 ↛ 2854line 2853 didn't jump to line 2854 because the condition on line 2853 was never true

2854 par = par[:-1] 

2855 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 2855 ↛ 2856line 2855 didn't jump to line 2856 because the condition on line 2855 was never true

2856 par = par[1:-1] 

2857 par = par.strip() 

2858 

2859 # Check for special script pronunciation followed by romanization, 

2860 # used in many Asian languages. 

2861 lst = par.split(", ") 

2862 if len(lst) == 2: 

2863 a, r = lst 

2864 if classify_desc(a) == "other": 

2865 cls = classify_desc(r) 

2866 # print("parse_translation_desc: r={} cls={}".format(r, cls)) 

2867 if cls == "romanization" or ( 

2868 cls == "english" and len(r.split()) == 1 and r[0].islower() 

2869 ): 

2870 if tr.get("alt") and tr.get("alt") != a: 2870 ↛ 2871line 2870 didn't jump to line 2871 because the condition on line 2870 was never true

2871 wxr.wtp.debug( 

2872 'more than one value in "alt": {} vs. {}'.format( 

2873 tr["alt"], a 

2874 ), 

2875 sortid="form_descriptions/1930", 

2876 ) 

2877 tr["alt"] = a 

2878 if tr.get("roman") and tr.get("roman") != r: 2878 ↛ 2879line 2878 didn't jump to line 2879 because the condition on line 2878 was never true

2879 wxr.wtp.debug( 

2880 'more than one value in "roman": {} vs. {}'.format( 

2881 tr["roman"], r 

2882 ), 

2883 sortid="form_descriptions/1936", 

2884 ) 

2885 tr["roman"] = r 

2886 continue 

2887 

2888 # Check for certain comma-separated tags combined with English text 

2889 # at the beginning or end of a comma-separated parenthesized list 

2890 while len(lst) > 1: 

2891 cls = classify_desc(lst[0]) 

2892 if cls == "tags": 2892 ↛ 2893line 2892 didn't jump to line 2893 because the condition on line 2892 was never true

2893 tagsets, topics = decode_tags(lst[0]) 

2894 for t in tagsets: 

2895 data_extend(tr, "tags", t) 

2896 data_extend(tr, "topics", topics) 

2897 lst = lst[1:] 

2898 continue 

2899 cls = classify_desc(lst[-1]) 

2900 if cls == "tags": 

2901 tagsets, topics = decode_tags(lst[-1]) 

2902 for t in tagsets: 

2903 data_extend(tr, "tags", t) 

2904 data_extend(tr, "topics", topics) 

2905 lst = lst[:-1] 

2906 continue 

2907 break 

2908 par = ", ".join(lst) 

2909 

2910 if not par: 2910 ↛ 2911line 2910 didn't jump to line 2911 because the condition on line 2910 was never true

2911 continue 

2912 if re.search(tr_ignored_parens_re, par): 2912 ↛ 2913line 2912 didn't jump to line 2913 because the condition on line 2912 was never true

2913 continue 

2914 if par.startswith("numeral:"): 

2915 par = par[8:].strip() 

2916 

2917 # Classify the part in parenthesis and process accordingly 

2918 cls = classify_desc(par) 

2919 # print("parse_translation_desc classify: {!r} -> {}" 

2920 # .format(par, cls)) 

2921 if par == text: 

2922 pass 

2923 if par == "f": 2923 ↛ 2924line 2923 didn't jump to line 2924 because the condition on line 2923 was never true

2924 data_append(tr, "tags", "feminine") 

2925 elif par == "m": 2925 ↛ 2926line 2925 didn't jump to line 2926 because the condition on line 2925 was never true

2926 data_append(tr, "tags", "masculine") 

2927 elif cls == "tags": 

2928 tagsets, topics = decode_tags(par) 

2929 for tags in tagsets: 

2930 data_extend(tr, "tags", tags) 

2931 data_extend(tr, "topics", topics) 

2932 elif cls == "english": 

2933 # If the text contains any of certain grammatical words, treat it 

2934 # as a "note" instead of "english" 

2935 if re.search(tr_note_re, par): 

2936 if par.endswith(":"): 2936 ↛ 2937line 2936 didn't jump to line 2937 because the condition on line 2936 was never true

2937 par = par[:-1] 

2938 if par not in ("see entry for forms",): 2938 ↛ 2813line 2938 didn't jump to line 2813 because the condition on line 2938 was always true

2939 if note: 2939 ↛ 2940line 2939 didn't jump to line 2940 because the condition on line 2939 was never true

2940 note = note + ";" + par 

2941 else: 

2942 note = par 

2943 else: 

2944 # There can be more than one parenthesized english item, see 

2945 # e.g. Aunt/English/Translations/Tamil 

2946 if "translation" in tr and "english" in tr: 

2947 tr["english"] += "; " + par # DEPRECATED for "translation" 

2948 tr["translation"] += "; " + par 

2949 else: 

2950 tr["english"] = par # DEPRECATED for "translation" 

2951 tr["translation"] = par 

2952 elif cls == "romanization": 

2953 # print("roman text={!r} text cls={}" 

2954 # .format(text, classify_desc(text))) 

2955 if classify_desc(text) in ( 

2956 "english", 

2957 "romanization", 

2958 ) and lang not in ("Egyptian",): 

2959 if beginning: 

2960 restore_beginning += "({}) ".format(par) 

2961 else: 

2962 restore_end = " ({})".format(par) + restore_end 

2963 else: 

2964 if tr.get("roman"): 2964 ↛ 2965line 2964 didn't jump to line 2965 because the condition on line 2964 was never true

2965 wxr.wtp.debug( 

2966 'more than one value in "roman": {} vs. {}'.format( 

2967 tr["roman"], par 

2968 ), 

2969 sortid="form_descriptions/2013", 

2970 ) 

2971 tr["roman"] = par 

2972 elif cls == "taxonomic": 2972 ↛ 2973line 2972 didn't jump to line 2973 because the condition on line 2972 was never true

2973 if tr.get("taxonomic"): 

2974 wxr.wtp.debug( 

2975 'more than one value in "taxonomic": {} vs. {}'.format( 

2976 tr["taxonomic"], par 

2977 ), 

2978 sortid="form_descriptions/2019", 

2979 ) 

2980 if re.match(r"×[A-Z]", par): 

2981 data_append(tr, "tags", "extinct") 

2982 par = par[1:] 

2983 tr["taxonomic"] = par 

2984 elif cls == "other": 2984 ↛ 2994line 2984 didn't jump to line 2994 because the condition on line 2984 was always true

2985 if tr.get("alt"): 2985 ↛ 2986line 2985 didn't jump to line 2986 because the condition on line 2985 was never true

2986 wxr.wtp.debug( 

2987 'more than one value in "alt": {} vs. {}'.format( 

2988 tr["alt"], par 

2989 ), 

2990 sortid="form_descriptions/2028", 

2991 ) 

2992 tr["alt"] = par 

2993 else: 

2994 wxr.wtp.debug( 

2995 "parse_translation_desc unimplemented cls {}: {}".format( 

2996 cls, par 

2997 ), 

2998 sortid="form_descriptions/2033", 

2999 ) 

3000 

3001 # Check for gender indications in suffix 

3002 text, final_tags = parse_head_final_tags(wxr, lang, text) 

3003 data_extend(tr, "tags", final_tags) 

3004 

3005 # Restore those parts that we did not want to remove (they are often 

3006 # optional words or words that are always used with the given translation) 

3007 text = restore_beginning + text + restore_end 

3008 

3009 if note: 

3010 tr["note"] = note.strip() 

3011 if text and text not in ignored_translations: 

3012 tr["word"] = text.strip() 

3013 

3014 # Sometimes gender seems to be at the end of "roman" field, see e.g. 

3015 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction") 

3016 roman = tr.get("roman") 

3017 if roman: 

3018 if roman.endswith(" f"): 3018 ↛ 3019line 3018 didn't jump to line 3019 because the condition on line 3018 was never true

3019 data_append(tr, "tags", "feminine") 

3020 tr["roman"] = roman[:-2].strip() 

3021 elif roman.endswith(" m"): 3021 ↛ 3022line 3021 didn't jump to line 3022 because the condition on line 3021 was never true

3022 data_append(tr, "tags", "masculine") 

3023 tr["roman"] = roman[:-2].strip() 

3024 

3025 # If the word now has "translation" field but no "roman" field, and 

3026 # the word would be classified "other" (generally non-latin 

3027 # characters), and the value in "translation" is only one lowercase 

3028 # word, move it to "roman". This happens semi-frequently when the 

3029 # translation is transliterated the same as some English word. 

3030 roman = tr.get("roman") 

3031 english = tr.get("translation") 

3032 if english and not roman and "word" in tr: 

3033 cls = classify_desc(tr["word"]) 

3034 if cls == "other" and " " not in english and english[0].islower(): 

3035 del tr["translation"] 

3036 if "english" in tr: # DEPRECATED for "translation" 3036 ↛ 3038line 3036 didn't jump to line 3038 because the condition on line 3036 was always true

3037 del tr["english"] 

3038 tr["roman"] = english 

3039 

3040 # If the entry now has both tr["roman"] and tr["word"] and they have 

3041 # the same value, delete tr["roman"] (e.g., man/English/Translations 

3042 # Evenki) 

3043 if tr.get("word") and tr.get("roman") == tr.get("word"): 3043 ↛ 3044line 3043 didn't jump to line 3044 because the condition on line 3043 was never true

3044 del tr["roman"] 

3045 
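A hedged illustration of the TranslationData this function builds: for a hypothetical translation item like "וואַסער n (vaser)", the gender suffix is handled by parse_head_final_tags(), the parenthesized romanization lands in "roman", and the bare word stays in "word", giving roughly:

tr_shape = {                 # illustrative only; exact tags depend on xlat_head_map
    "word": "וואַסער",
    "roman": "vaser",
    "tags": ["neuter"],
}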

3046 

3047def parse_alt_or_inflection_of( 

3048 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str] 

3049) -> Optional[tuple[list[str], Optional[list[AltOf]]]]: 

3050 """Tries to parse an inflection-of or alt-of description. If successful, 

3051 this returns (tags, alt-of/inflection-of-dict). If the description cannot 

3052 be parsed, this returns None. This may also return (tags, None) when the 

3053 gloss describes a form (or some other tags were extracted from it), but 

3054 there was no alt-of/form-of/synonym-of word.""" 

3055 # print("parse_alt_or_inflection_of: {!r}".format(gloss)) 

3056 # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning. 

3057 

3058 # Never interpret a gloss that is equal to the word itself as a tag 

3059 # (e.g., instrumental/Romanian, instrumental/Spanish). 

3060 if gloss.lower() == wxr.wtp.title.lower() or ( # type:ignore[union-attr] 

3061 len(gloss) >= 5 and distw([gloss.lower()], wxr.wtp.title.lower()) < 0.2 # type:ignore[union-attr] 

3062 ): 

3063 return None 

3064 

3065 # First try parsing it as-is 

3066 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args) 

3067 if parsed is not None: 

3068 return parsed 

3069 

3070 # Next try parsing it with the first character converted to lowercase if 

3071 # it was previously uppercase. 

3072 if gloss and gloss[0].isupper(): 

3073 gloss = gloss[0].lower() + gloss[1:] 

3074 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args) 

3075 if parsed is not None: 

3076 return parsed 

3077 

3078 return None 

3079 
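An illustrative expected return value (hedged; the exact tags depend on decode_tags() and form_of_tags): for a gloss such as "genitive singular of talo", the function is meant to return a sorted tag list containing "form-of" plus the grammatical tags, and a list of AltOf-style dicts naming the base word:

expected = (
    ["form-of", "genitive", "singular"],   # sorted, deduplicated tags
    [{"word": "talo"}],                    # may also carry an "extra" key
)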

3080 

3081# These tags are not allowed in alt-or-inflection-of parsing 

3082alt_infl_disallowed: set[str] = set( 

3083 [ 

3084 "error-unknown-tag", 

3085 "place", # Not in inflected forms and causes problems e.g. house/English 

3086 ] 

3087) 

3088 

3089 

3090def parse_alt_or_inflection_of1( 

3091 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str] 

3092) -> Optional[tuple[list[str], Optional[list[AltOf]]]]: 

3093 """Helper function for parse_alt_or_inflection_of. This handles a single 

3094 capitalization.""" 

3095 if not gloss or not gloss.strip(): 3095 ↛ 3096line 3095 didn't jump to line 3096 because the condition on line 3095 was never true

3096 return None 

3097 

3098 # Prevent some common errors where we would parse something we shouldn't 

3099 if re.search(r"(?i)form of address ", gloss): 3099 ↛ 3100line 3099 didn't jump to line 3100 because the condition on line 3099 was never true

3100 return None 

3101 

3102 gloss = re.sub(r"only used in [^,]+, ", "", gloss) 

3103 

3104 # First try all formats ending with "of" (or other known last words that 

3105 # can end a form description) 

3106 matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss)) 

3107 m: Optional[re.Match] 

3108 for m in reversed(matches): 

3109 desc = gloss[: m.end()].strip() 

3110 base = gloss[m.end() :].strip() 

3111 tagsets, topics = decode_tags(desc, no_unknown_starts=True) 

3112 if not topics and any( 

3113 not (alt_infl_disallowed & set(ts)) for ts in tagsets 

3114 ): 

3115 # Successfully parsed, including "of" etc. 

3116 tags: list[str] = [] 

3117 # If you have ("Western-Armenian", ..., "form-of") as your 

3118 # tag set, it's most probable that it's something like 

3119 # "Western Armenian form of խոսել (xosel)", which should 

3120 # get "alt-of" instead of "form-of" (inflection). 

3121 # խօսիլ/Armenian 

3122 for ts_t in tagsets: 

3123 if "form-of" in ts_t and any( 

3124 valid_tags.get(tk) == "dialect" for tk in ts_t 

3125 ): 

3126 ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"} 

3127 else: 

3128 ts_s = set(ts_t) 

3129 if not (alt_infl_disallowed & ts_s): 3129 ↛ 3122line 3129 didn't jump to line 3122 because the condition on line 3129 was always true

3130 tags.extend(ts_s) 

3131 if ( 

3132 "alt-of" in tags 

3133 or "form-of" in tags 

3134 or "synonym-of" in tags 

3135 or "compound-of" in tags 

3136 ): 

3137 break 

3138 if m.group(1) == "of": 

3139 # Try parsing without the final "of". This is commonly used in 

3140 # various form-of expressions. 

3141 desc = gloss[: m.start()] 

3142 base = gloss[m.end() :] 

3143 tagsets, topics = decode_tags(desc, no_unknown_starts=True) 

3144 # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}" 

3145 # .format(desc, base, tagsets, topics)) 

3146 if not topics and any( 

3147 not (alt_infl_disallowed & set(t)) for t in tagsets 

3148 ): 

3149 tags = [] 

3150 for t in tagsets: 

3151 if not (alt_infl_disallowed & set(t)): 3151 ↛ 3150line 3151 didn't jump to line 3150 because the condition on line 3151 was always true

3152 tags.extend(t) 

3153 # It must have at least one tag from form_of_tags 

3154 if set(tags) & form_of_tags: 

3155 # Accept this as form-of 

3156 tags.append("form-of") 

3157 break 

3158 if set(tags) & alt_of_tags: 

3159 # Accept this as alt-of 

3160 tags.append("alt-of") 

3161 break 

3162 

3163 else: 

3164 # Did not find a form description based on last word; see if the 

3165 # whole description is tags 

3166 tagsets, topics = decode_tags(gloss, no_unknown_starts=True) 

3167 if not topics and any( 

3168 not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts) 

3169 for ts in tagsets 

3170 ): 

3171 tags = [] 

3172 for ts in tagsets: 

3173 if not (alt_infl_disallowed & set(ts)) and form_of_tags & set( 3173 ↛ 3172line 3173 didn't jump to line 3172 because the condition on line 3173 was always true

3174 ts 

3175 ): 

3176 tags.extend(ts) 

3177 base = "" 

3178 else: 

3179 return None 

3180 

3181 # kludge for Spanish (again): 'x of [word] combined with [clitic]' 

3182 m = re.search(r"combined with \w+$", base) 

3183 if m: 3183 ↛ 3184line 3183 didn't jump to line 3184 because the condition on line 3183 was never true

3184 tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True) 

3185 if not topics: 

3186 for ts in tagsets: 

3187 tags.extend(ts) 

3188 base = base[: m.start()] 

3189 

3190 # It is fairly common for form_of glosses to end with something like 

3191 # "ablative case" or "in instructive case". Parse that ending. 

3192 base = base.strip() 

3193 lst = base.split() 

3194 # print("parse_alt_or_inflection_of: lst={}".format(lst)) 

3195 if len(lst) >= 3 and lst[-1] in ("case", "case."): 3195 ↛ 3196line 3195 didn't jump to line 3196 because the condition on line 3195 was never true

3196 node = valid_sequences.children.get(lst[-2]) 

3197 if node and node.end: 

3198 for s in node.tags: 

3199 tags.extend(s.split(" ")) 

3200 lst = lst[:-2] 

3201 if lst[-1] == "in" and len(lst) > 1: 

3202 lst = lst[:-1] 

3203 

3204 # Eliminate empty and duplicate tags 

3205 tags = sorted(set(t for t in tags if t)) 

3206 

3207 # Clean up some extra stuff from the linked word, separating the text 

3208 # into ``base`` (the linked word) and ``extra`` (additional information, 

3209 # such as English translation or clarifying word sense information). 

3210 orig_base = base 

3211 base = re.sub(alt_of_form_of_clean_re, "", orig_base) 

3212 base = re.sub(r" [(⟨][^()]*[)⟩]", "", base) # Remove all (...) groups 

3213 extra = orig_base[len(base) :] 

3214 extra = re.sub(r"^[- :;.,,—]+", "", extra) 

3215 if extra.endswith(".") and extra.count(".") == 1: 

3216 extra = extra[:-1].strip() 

3217 m = re.match(r"^\(([^()]*)\)$", extra) 

3218 if m: 3218 ↛ 3219line 3218 didn't jump to line 3219 because the condition on line 3218 was never true

3219 extra = m.group(1) 

3220 else: 

3221 # These weird brackets are used in "slash mark" 

3222 m = re.match(r"^⟨([^()]*)⟩$", extra) 

3223 if m: 3223 ↛ 3224line 3223 didn't jump to line 3224 because the condition on line 3223 was never true

3224 extra = m.group(1) 

3225 m = re.match(r'^[“"]([^"“”]*)["”]$', extra) 

3226 if m: 3226 ↛ 3227line 3226 didn't jump to line 3227 because the condition on line 3226 was never true

3227 extra = m.group(1) 

3228 # Note: base might still contain comma-separated values and values 

3229 # separated by "and" 

3230 base = base.strip() 

3231 if base.endswith(",") and len(base) > 2: 3231 ↛ 3232line 3231 didn't jump to line 3232 because the condition on line 3231 was never true

3232 base = base[:-1].strip() 

3233 while ( 

3234 base.endswith(".") 

3235 and not wxr.wtp.page_exists(base) 

3236 and base not in gloss_template_args 

3237 ): 

3238 base = base[:-1].strip() 

3239 if base.endswith('(\u201cconjecture")'): 3239 ↛ 3240 (line 3239 didn't jump to line 3240 because the condition on line 3239 was never true)

3240 base = base[:-14].strip() 

3241 tags.append("conjecture") 

3242 while ( 3242 ↛ 3247 (line 3242 didn't jump to line 3247 because the condition on line 3242 was never true)

3243 base.endswith(".") 

3244 and not wxr.wtp.page_exists(base) 

3245 and base not in gloss_template_args 

3246 ): 

3247 base = base[:-1].strip() 

3248 if ( 3248 ↛ 3253 (line 3248 didn't jump to line 3253 because the condition on line 3248 was never true)

3249 base.endswith(".") 

3250 and base not in gloss_template_args 

3251 and base[:-1] in gloss_template_args 

3252 ): 

3253 base = base[:-1] 

3254 base = base.strip() 

3255 if not base: 

3256 return tags, None 

3257 

3258 # Kludge: Spanish verb forms seem to have a dot added at the end. 

3259 # Remove it; we know of no Spanish verbs ending with a dot. 

3260 language = wxr.wtp.section 

3261 pos = wxr.wtp.subsection 

3262 # print("language={} pos={} base={}".format(language, pos, base)) 

3263 if ( 3263 ↛ 3269 (line 3263 didn't jump to line 3269 because the condition on line 3263 was never true)

3264 base.endswith(".") 

3265 and len(base) > 1 

3266 and base[-2].isalpha() 

3267 and (language == "Spanish" and pos == "Verb") 

3268 ): 

3269 base = base[:-1] 

3270 

3271 # Split base to alternatives when multiple alternatives provided 

3272 parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "]) 

3273 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "") 

3274 if ( 

3275 len(parts) <= 1 

3276 or base.startswith("/") 

3277 or base.endswith("/") 

3278 or "/" in titleword 

3279 ): 

3280 parts = [base] 

3281 # Split base to alternatives when of form "a or b" and "a" and "b" are 

3282 # similar (generally spelling variants of the same word or similar words) 

3283 if len(parts) == 1: 

3284 pp = base.split() 

3285 if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4: 

3286 parts = [pp[0], pp[2]] 

3287 

3288 # Create form-of/alt-of entries based on the extracted data 

3289 dt_lst: list[AltOf] = [] 

3290 for p in parts: 

3291 # Check for some suspicious base forms 

3292 m = re.search(r"[.,] |[{}()]", p) 

3293 if m and not wxr.wtp.page_exists(p): 3293 ↛ 3294 (line 3293 didn't jump to line 3294 because the condition on line 3293 was never true)

3294 wxr.wtp.debug( 

3295 "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p), 

3296 sortid="form_descriptions/2278", 

3297 ) 

3298 if p.startswith("*") and len(p) >= 3 and p[1].isalpha(): 3298 ↛ 3299 (line 3298 didn't jump to line 3299 because the condition on line 3298 was never true)

3299 p = p[1:] 

3300 dt: AltOf = {"word": p} 

3301 if extra: 

3302 dt["extra"] = extra 

3303 dt_lst.append(dt) 

3304 # print("alt_or_infl_of returning tags={} lst={} base={!r}" 

3305 # .format(tags, lst, base)) 

3306 return tags, dt_lst 
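For orientation, the value returned above pairs the accumulated tag list with a list of AltOf dictionaries; a minimal sketch of that shape, using a hypothetical linked word and gloss (the enclosing function name parse_alt_or_inflection_of is taken from the debug comments, and the concrete tags are illustrative only):

    # Hypothetical result shape; actual tags depend on the tag tables.
    tags = ["form-of", "plural"]
    dt_lst = [{"word": "perro", "extra": "dog"}]
    # The code above returns (tags, dt_lst), or (tags, None) when the base
    # is empty, or None when no form/alt description was recognized.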

3307 

3308 

3309@functools.lru_cache(maxsize=65536) 

3310def classify_desc( 

3311 desc: str, 

3312 allow_unknown_tags=False, 

3313 no_unknown_starts=False, 

3314 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(), 

3315) -> str: 

3316 """Determines whether the given description is most likely tags, english, 

3317 a romanization, a taxonomic name, or something else. Returns one of: "tags", 

3318 "romanization", or "other". If ``allow_unknown_tags`` is True, then 

3319 allow "tags" classification even when the only tags are those starting 

3320 with a word in allowed_unknown_starts.""" 

3321 assert isinstance(desc, str) 

3322 # Empty and whitespace-only strings are treated as "other" 

3323 desc = desc.strip() 

3324 if not desc: 

3325 return "other" 

3326 

3327 normalized_desc = unicodedata.normalize("NFKD", desc) 

3328 

3329 # If it can be fully decoded as tags without errors, treat as tags 

3330 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts) 

3331 for tagset in tagsets: 

3332 assert isinstance(tagset, (list, tuple, set)) 

3333 if "error-unknown-tag" not in tagset and ( 

3334 topics or allow_unknown_tags or any(" " not in x for x in tagset) 

3335 ): 

3336 return "tags" 

3337 

3338 # Check if it looks like the taxonomic name of a species 

3339 if desc in known_species: 

3340 return "taxonomic" 

3341 desc1 = re.sub(r"^×([A-Z])", r"\1", desc) 

3342 desc1 = re.sub(r"\s*×.*", "", desc1) 

3343 lst = desc1.split() 

3344 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts: 

3345 have_non_english = 1 if lst[0].lower() not in english_words else 0 

3346 for x in lst[1:]: 

3347 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"): 

3348 continue 

3349 if x[0].isupper(): 

3350 break 

3351 if x not in english_words: 

3352 have_non_english += 1 

3353 else: 

3354 # Starts with known taxonomic term, does not contain uppercase 

3355 # words (except allowed letters) and at least one word is not 

3356 # English 

3357 if have_non_english >= len(lst) - 1 and have_non_english > 0: 3357 ↛ 3363 (line 3357 didn't jump to line 3363 because the condition on line 3357 was always true)

3358 return "taxonomic" 

3359 

3360 # If all words are in our English dictionary, interpret as English. 

3361 # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde" 

3362 # in ASCII. Took me a while to figure out. 

3363 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1: 

3364 if desc in english_words and desc[0].isalpha(): 

3365 return "english" # Handles ones containing whitespace 

3366 desc1 = re.sub( 

3367 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc 

3368 ) 

3369 tokens = tokenizer.tokenize(desc1) 

3370 if not tokens: 3370 ↛ 3371 (line 3370 didn't jump to line 3371 because the condition on line 3370 was never true)

3371 return "other" 

3372 lst_bool = list( 

3373 x not in not_english_words 

3374 and 

3375 # not x.isdigit() and 

3376 ( 

3377 x in english_words 

3378 or x.lower() in english_words 

3379 or x in known_firsts 

3380 or x[0].isdigit() 

3381 or x in accepted 

3382 or 

3383 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or 

3384 ( 

3385 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words 

3386 ) # Plural 

3387 or ( 

3388 x.endswith("ies") 

3389 and len(x) >= 5 

3390 and x[:-3] + "y" in english_words 

3391 ) # E.g. lily - lilies 

3392 or ( 

3393 x.endswith("ing") 

3394 and len(x) >= 5 

3395 and x[:-3] in english_words 

3396 ) # E.g. bring - bringing 

3397 or ( 

3398 x.endswith("ing") 

3399 and len(x) >= 5 

3400 and x[:-3] + "e" in english_words 

3401 ) # E.g., tone - toning 

3402 or ( 

3403 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words 

3404 ) # E.g. hang - hanged 

3405 or ( 

3406 x.endswith("ed") 

3407 and len(x) >= 5 

3408 and x[:-2] + "e" in english_words 

3409 ) # E.g. atone - atoned 

3410 or (x.endswith("'s") and x[:-2] in english_words) 

3411 or (x.endswith("s'") and x[:-2] in english_words) 

3412 or ( 

3413 x.endswith("ise") 

3414 and len(x) >= 5 

3415 and x[:-3] + "ize" in english_words 

3416 ) 

3417 or ( 

3418 x.endswith("ised") 

3419 and len(x) >= 6 

3420 and x[:-4] + "ized" in english_words 

3421 ) 

3422 or ( 

3423 x.endswith("ising") 

3424 and len(x) >= 7 

3425 and x[:-5] + "izing" in english_words 

3426 ) 

3427 or ( 

3428 re.search(r"[-/]", x) 

3429 and all( 

3430 ((y in english_words and len(y) > 2) or not y) 

3431 for y in re.split(r"[-/]", x) 

3432 ) 

3433 ) 

3434 ) 

3435 for x in tokens 

3436 ) 

3437 cnt = lst_bool.count(True) 

3438 rejected_words = tuple( 

3439 x for i, x in enumerate(tokens) if not lst_bool[i] 

3440 ) 

3441 if ( 

3442 any( 

3443 lst_bool[i] and x[0].isalpha() and len(x) > 1 

3444 for i, x in enumerate(tokens) 

3445 ) 

3446 and not desc.startswith("-") 

3447 and not desc.endswith("-") 

3448 and re.search(r"\w+", desc) 

3449 and ( 

3450 cnt == len(lst_bool) 

3451 or ( 

3452 any( 

3453 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens) 

3454 ) 

3455 and cnt >= len(lst_bool) - 1 

3456 ) 

3457 or cnt / len(lst_bool) >= 0.8 

3458 or ( 

3459 all(x in potentially_english_words for x in rejected_words) 

3460 and cnt / len(lst_bool) >= 0.50 

3461 ) 

3462 ) 

3463 ): 

3464 return "english" 

3465 # Some translations have apparent pronunciation descriptions in /.../ 

3466 # which we'll put in the romanization field (even though they probably are 

3467 # not exactly romanizations). 

3468 if desc.startswith("/") and desc.endswith("/"): 

3469 return "romanization" 

3470 # If all characters are in classes that could occur in romanizations, 

3471 # treat as romanization 

3472 classes = list( 

3473 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK" 

3474 for x in normalized_desc 

3475 ) 

3476 classes1 = [] 

3477 num_latin = 0 

3478 num_greek = 0 

3479 # part = "" 

3480 # for ch, cl in zip(normalized_desc, classes): 

3481 # part += f"{ch}({cl})" 

3482 # print(part) 

3483 for ch, cl in zip(normalized_desc, classes): 

3484 if ch in ( 

3485 "'", # ' in Arabic, / in IPA-like parenthesized forms 

3486 ".", # e.g., "..." in translations 

3487 ";", 

3488 ":", 

3489 "!", 

3490 "‘", 

3491 "’", 

3492 '"', 

3493 "“", 

3494 "”", 

3495 "/", 

3496 "?", 

3497 "…", # alternative to "..." 

3498 "⁉", # 見る/Japanese automatic transcriptions... 

3499 "?", 

3500 "!", 

3501 "⁻", # superscript -, used in some Cantonese roman, e.g. "we" 

3502 "ʔ", 

3503 "ʼ", 

3504 "ʾ", 

3505 "ʹ", 

3506 ): # ʹ e.g. in understand/English/verb Russian transl 

3507 classes1.append("OK") 

3508 continue 

3509 if cl not in ("Ll", "Lu"): 

3510 classes1.append(cl) 

3511 continue 

3512 try: 

3513 name = unicodedata.name(ch) 

3514 first = name.split()[0] 

3515 if first == "LATIN": 

3516 num_latin += 1 

3517 elif first == "GREEK": 

3518 num_greek += 1 

3519 elif first == "COMBINING": # Combining diacritic 3519 ↛ 3520 (line 3519 didn't jump to line 3520 because the condition on line 3519 was never true)

3520 cl = "OK" 

3521 elif re.match(non_latin_scripts_re, name): 3521 ↛ 3525 (line 3521 didn't jump to line 3525 because the condition on line 3521 was always true)

3522 cl = "NO" # Not acceptable in romanizations 

3523 except ValueError: 

3524 cl = "NO" # Not acceptable in romanizations 

3525 classes1.append(cl) 

3526 # print("classify_desc: {!r} classes1: {}".format(desc, classes1)) 

3527 # print(set(classes1) ) 

3528 if all( 

3529 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK") 

3530 for x in classes1 

3531 ): 

3532 if ( 

3533 (num_latin >= num_greek + 2 or num_greek == 0) 

3534 and classes1.count("OK") < len(classes1) 

3535 and classes1.count("Nd") < len(classes1) 

3536 ): 

3537 return "romanization" 

3538 # Otherwise it is something else, such as a hanji version of the word 

3539 return "other" 
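Because classify_desc() is a cached string classifier with no context argument, it can be exercised directly; a minimal usage sketch (the module path follows this file's location, and the labels in the comments are plausible rather than verified, since they depend on the loaded tag and word tables):

    from wiktextract.extractor.en.form_descriptions import classify_desc

    # Each call returns one of "tags", "english", "romanization",
    # "taxonomic", or "other"; the expected labels below are illustrative.
    print(classify_desc("nominative plural"))  # likely "tags"
    print(classify_desc("a small dog"))        # likely "english"
    print(classify_desc("Canis lupus"))        # likely "taxonomic"
    print(classify_desc("слово"))              # Cyrillic script, so "other"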

3540 

3541 

3542def remove_text_in_parentheses(text: str) -> str: 

3543 parentheses = 0 

3544 new_text = "" 

3545 for c in text: 

3546 if c == "(": 

3547 parentheses += 1 

3548 elif c == ")": 

3549 parentheses -= 1 

3550 elif parentheses == 0: 

3551 new_text += c 

3552 return new_text
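remove_text_in_parentheses() drops everything inside (possibly nested) parentheses, including the parentheses themselves, and keeps the surrounding text verbatim; a quick sanity check with made-up inputs:

    # Follows directly from the loop above; note the double spaces, since
    # the characters around the removed span are kept as-is.
    assert remove_text_in_parentheses("word (obsolete) form") == "word  form"
    assert remove_text_in_parentheses("a (b (c) d) e") == "a  e"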