Coverage for src/wiktextract/extractor/en/form_descriptions.py: 70%

1321 statements  

coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1# Code for parsing linguistic form descriptions and tags for word senses 

2# (both the word entry head - initial part and parenthesized parts - 

3# and tags at the beginning of word senses) 

4# 

5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

6 

7import functools 

8import re 

9import unicodedata 

10from typing import ( 

11 Any, 

12 Literal, 

13 Optional, 

14 Sequence, 

15 Union, 

16) 

17 

18import Levenshtein 

19from nltk import TweetTokenizer # type:ignore[import-untyped] 

20 

21from ...datautils import data_append, data_extend, split_at_comma_semi 

22from ...tags import ( 

23 alt_of_tags, 

24 form_of_tags, 

25 head_final_bantu_langs, 

26 head_final_bantu_map, 

27 head_final_numeric_langs, 

28 head_final_other_langs, 

29 head_final_other_map, 

30 head_final_semitic_langs, 

31 head_final_semitic_map, 

32 uppercase_tags, 

33 valid_tags, 

34 xlat_descs_map, 

35 xlat_head_map, 

36 xlat_tags_map, 

37) 

38from ...topics import topic_generalize_map, valid_topics 

39from ...wxr_context import WiktextractContext 

40from .english_words import ( 

41 english_words, 

42 not_english_words, 

43 potentially_english_words, 

44) 

45from .form_descriptions_known_firsts import known_firsts 

46from .taxondata import known_species 

47from .type_utils import ( 

48 AltOf, 

49 FormData, 

50 LinkageData, 

51 SenseData, 

52 SoundData, 

53 TranslationData, 

54 WordData, 

55) 

56 

57# Tokenizer for classify_desc() 

58tokenizer = TweetTokenizer() 

59 

60# These are ignored as the value of a related form in form head. 

61IGNORED_RELATED: set[str] = set( 

62 [ 

63 "-", 

64 "־", 

65 "᠆", 

66 "‐", 

67 "‑", 

68 "‒", 

69 "–", 

70 "—", 

71 "―", 

72 "−", 

73 "⸺", 

74 "⸻", 

75 "﹘", 

76 "﹣", 

77 "-", 

78 "?", 

79 "(none)", 

80 ] 

81) 

82 

83 

84# First words of unicodedata.name() that indicate scripts that cannot be 

85# accepted in romanizations or english (i.e., should be considered "other" 

86# in classify_desc()). 

87non_latin_scripts: list[str] = [ 

88 "ADLAM", 

89 "ARABIC", 

90 "ARABIC-INDIC", 

91 "ARMENIAN", 

92 "BALINESE", 

93 "BENGALI", 

94 "BRAHMI", 

95 "BRAILLE", 

96 "CANADIAN", 

97 "CHAKMA", 

98 "CHAM", 

99 "CHEROKEE", 

100 "CJK", 

101 "COPTIC", 

102 "COUNTING ROD", 

103 "CUNEIFORM", 

104 "CYRILLIC", 

105 "DOUBLE-STRUCK", 

106 "EGYPTIAN", 

107 "ETHIOPIC", 

108 "EXTENDED ARABIC-INDIC", 

109 "GEORGIAN", 

110 "GLAGOLITIC", 

111 "GOTHIC", 

112 "GREEK", 

113 "GUJARATI", 

114 "GURMUKHI", 

115 "HANGUL", 

116 "HANIFI ROHINGYA", 

117 "HEBREW", 

118 "HIRAGANA", 

119 "JAVANESE", 

120 "KANNADA", 

121 "KATAKANA", 

122 "KAYAH LI", 

123 "KHMER", 

124 "KHUDAWADI", 

125 "LAO", 

126 "LEPCHA", 

127 "LIMBU", 

128 "MALAYALAM", 

129 "MEETEI", 

130 "MYANMAR", 

131 "NEW TAI LUE", 

132 "NKO", 

133 "OL CHIKI", 

134 "OLD PERSIAN", 

135 "OLD SOUTH ARABIAN", 

136 "ORIYA", 

137 "OSMANYA", 

138 "PHOENICIAN", 

139 "SAURASHTRA", 

140 "SHARADA", 

141 "SINHALA", 

142 "SUNDANESE", 

143 "SYLOTI", 

144 "TAI THAM", 

145 "TAKRI", 

146 "TAMIL", 

147 "TELUGU", 

148 "THAANA", 

149 "THAI", 

150 "TIBETAN", 

151 "TIFINAGH", 

152 "TIRHUTA", 

153 "UGARITIC", 

154 "WARANG CITI", 

155 "YI", 

156] 

157non_latin_scripts_re = re.compile( 

158 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b" 

159) 
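# Clarifying example (added note, not from the original source): for a
# character such as "あ", unicodedata.name() returns "HIRAGANA LETTER A", and
# for "а" it returns "CYRILLIC SMALL LETTER A"; matching the first words of the
# name against non_latin_scripts_re flags characters that cannot appear in
# romanizations or English text, which classify_desc() then treats as "other".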

160 

161# Sanity check xlat_head_map values 

162for k, v in xlat_head_map.items(): 

163 if v.startswith("?"): 

164 v = v[1:] 

165 for tag in v.split(): 

166 if tag not in valid_tags: 166 ↛ 167

167 print( 

168 "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format( 

169 k, tag 

170 ) 

171 ) 

172 

173# Regexp for finding nested translations from translation items (these are 

174# used in, e.g., year/English/Translations/Arabic). This is actually used 

175# in page.py. 

176nested_translations_re = re.compile( 

177 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format( 

178 "|".join( 

179 re.escape(x.removeprefix("?")) 

180 for x in sorted(xlat_head_map.values(), key=len, reverse=True) 

181 if x and not x.startswith("class-") 

182 ) 

183 ) 

184) 

185 

186# Regexp that matches head tag specifiers. Used to match tags from end of 

187# translations and linkages 

188head_final_re_text = r"( -)?( ({}))+".format( 

189 "|".join( 

190 re.escape(x) 

191 for x in 

192 # The sort is to put longer ones first, preferring them in 

193 # the regexp match 

194 sorted(xlat_head_map.keys(), key=len, reverse=True) 

195 ) 

196) 

197head_final_re = re.compile(head_final_re_text + "$") 
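# Hedged example (added, not from the original source): assuming xlat_head_map
# contains keys such as "f" and "pl", a linkage or translation string like
# "chienne f pl" ends in text matched by head_final_re, and
# parse_head_final_tags() below strips that suffix and converts the matched
# keys into tags.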

198 

199# Regexp used to match head tag specifiers at end of a form for certain 

200# Bantu languages (particularly Swahili and similar languages). 

201head_final_bantu_re_text = r" ({})".format( 

202 "|".join(re.escape(x) for x in head_final_bantu_map.keys()) 

203) 

204head_final_bantu_re = re.compile(head_final_bantu_re_text + "$") 

205 

206# Regexp used to match head tag specifiers at end of a form for certain 

207# Semitic languages (particularly Arabic and similar languages). 

208head_final_semitic_re_text = r" ({})".format( 

209 "|".join(re.escape(x) for x in head_final_semitic_map.keys()) 

210) 

211head_final_semitic_re = re.compile(head_final_semitic_re_text + "$") 

212 

213# Regexp used to match head tag specifiers at end of a form for certain 

214# other languages (e.g., Lithuanian, Finnish, French). 

215head_final_other_re_text = r" ({})".format( 

216 "|".join(re.escape(x) for x in head_final_other_map.keys()) 

217) 

218head_final_other_re = re.compile(head_final_other_re_text + "$") 

219 

220# Regexp for splitting heads. See parse_word_head(). 

221head_split_re_text = ( 

222 "(" 

223 + head_final_re_text 

224 + "|" 

225 + head_final_bantu_re_text 

226 + "|" 

227 + head_final_semitic_re_text 

228 + "|" 

229 + head_final_other_re_text 

230 + ")?( or |[,;]+)" 

231) 

232head_split_re = re.compile(head_split_re_text) 

233head_split_re_parens = 0 

234for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text): 

235 head_split_re_parens += m.group(0).count("(") 
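# (Added note, low-certainty: the loop above counts unescaped "(" characters in
# head_split_re_text, i.e. roughly how many regex groups the pattern opens;
# callers that re.split() on head_split_re presumably use this count to step
# over the extra captured groups in the result.)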

236 

237# Parenthesized parts that are ignored in translations 

238tr_ignored_parens: set[str] = set( 

239 [ 

240 "please verify", 

241 "(please verify)", 

242 "transliteration needed", 

243 "(transliteration needed)", 

244 "in words with back vowel harmony", 

245 "(in words with back vowel harmony)", 

246 "in words with front vowel harmony", 

247 "(in words with front vowel harmony)", 

248 "see below", 

249 "see usage notes below", 

250 ] 

251) 

252tr_ignored_parens_re = re.compile( 

253 r"^(" 

254 + "|".join(re.escape(x) for x in tr_ignored_parens) 

255 + ")$" 

256 + r"|^(Can we clean up|Can we verify|for other meanings see " 

257 r"lit\. )" 

258) 

259 

260# Translations that are ignored 

261ignored_translations: set[str] = set( 

262 [ 

263 "[script needed]", 

264 "please add this translation if you can", 

265 ] 

266) 

267 

268# Put english text into the "note" field in a translation if it contains one 

269# of these words 

270tr_note_re = re.compile( 

271 r"(\b(article|definite|indefinite|superlative|comparative|pattern|" 

272 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|" 

273 r"postposition|postp|action|actions|articles|" 

274 r"adverb|adverbs|noun|nouns|verb|verbs|before|" 

275 r"after|placed|prefix|suffix|used with|translated|" 

276 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|" 

277 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|" 

278 r"conjugation|declension|class|category|plural|singular|positive|" 

279 r"seldom used|formal|informal|familiar|unspoken|spoken|written|" 

280 r"indicative|progressive|conditional|potential|" 

281 r"accusative|adessive|inessive|superessive|elative|allative|" 

282 r"dialect|dialects|object|subject|predicate|movies|recommended|language|" 

283 r"locative|continuous|simple|continuousness|gerund|subjunctive|" 

284 r"periphrastically|no equivalent|not used|not always used|" 

285 r"used only with|not applicable|use the|signifying|wordplay|pronounced|" 

286 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|" 

287 r"may be replaced|stricter sense|for nonhumans|" 

288 r"sense:|used:|in full:|informally used|followed by|" 

289 r"not restricted to|pertaining to|or optionally with|are optional|" 

290 r"in conjunction with|in compounds|depending on the relationship|" 

291 r"person addressed|one person|multiple persons|may be replaced with|" 

292 r"optionally completed with|in the phrase|in response to|" 

293 r"before a|before an|preceded by|verbs ending|very common|after a verb|" 

294 r"with verb|with uncountable|with the objects|with stative|" 

295 r"can be replaced by|often after|used before|used after|" 

296 r"used in|clipping of|spoken|somewhat|capitalized|" 

297 r"short form|shortening of|shortened form|initialism of|" 

298 r"said to|rare:|rarer also|is rarer|negatively connoted|" 

299 r"previously mentioned|uncountable noun|countable noun|" 

300 r"countable nouns|uncountable nouns|" 

301 r"with predicative|with -|with imperfect|with a negated|" 

302 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|" 

303 r'"|' 

304 r"general term|after a vowel|before a vowel|" 

305 r"form|regular|irregular|alternative)" 

306 r")($|[) ])|^(" 

307 # Following are only matched at the beginning of the string 

308 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|" 

309 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|" 

310 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|" 

311 r"mainly|from|for|also|also:|acronym|" 

312 r"\+|with) " 

313) 

314# \b does not work at the end??? 

315 

316# Related forms matching this regexp will be considered suspicious if the 

317# page title does not also match one of these. 

318suspicious_related_re = re.compile( 

319 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)" 

320 r"|[][:=<>&#*|]" 

321 r"| \d+$" 

322) 

323 

324# Word forms (head forms, translations, etc) that will be considered ok and 

325# silently accepted even if they would otherwise trigger a suspicious 

326# form warning. 

327ok_suspicious_forms: set[str] = set( 

328 [ 

329 "but en or", # "golden goal"/English/Tr/French 

330 "cœur en or", # "heart of gold"/Eng/Tr/French 

331 "en or", # golden/Eng/Tr/French 

332 "men du", # jet/Etym2/Noun/Tr/Cornish 

333 "parachute en or", # "golden parachute"/Eng/Tr/French 

334 "vieil or", # "old gold"/Eng/Tr/French 

335 # "all that glitters is not gold"/Eng/Tr/French 

336 "tout ce qui brille n’est pas or", 

337 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek 

338 "period or full stop", 

339 ] 

340) 

341 

342 

343# Replacements to be done in classify_desc before tokenizing. This is a 

344# workaround for shortcomings in TweetTokenizer. 

345tokenizer_fixup_map = { 

346 r"a.m.": "AM", 

347 r"p.m.": "PM", 

348} 

349tokenizer_fixup_re = re.compile( 

350 r"\b(" 

351 + "|".join( 

352 re.escape(x) 

353 for x in sorted( 

354 tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True 

355 ) 

356 ) 

357 + r")" 

358) 

359 

360# Unknown tags starting with these words will be silently ignored. 

361ignored_unknown_starts: set[str] = set( 

362 [ 

363 "originally", 

364 "e.g.", 

365 "c.f.", 

366 "supplanted by", 

367 "supplied by", 

368 ] 

369) 

370 

371ignored_unknown_starts_re = re.compile( 

372 r"^(" 

373 + "|".join( 

374 re.escape(x) 

375 for x in sorted(ignored_unknown_starts, key=lambda x: -len(x)) 

376 ) 

377 + ") " 

378) 

379 

380# If an unknown sequence starts with one of these, it will continue as an 

381# unknown sequence until the end, unless it turns out to have a replacement. 

382allowed_unknown_starts: set[str] = set( 

383 [ 

384 "Relating", 

385 "accompanied", 

386 "added", 

387 "after", 

388 "answering", 

389 "as", 

390 "based", 

391 "before", 

392 "conjugated", 

393 "conjunction", 

394 "construed", 

395 "especially", 

396 "expression:", 

397 "figurative:", 

398 "followed", 

399 "for", 

400 "forms", 

401 "from", 

402 "governs", 

403 "in", 

404 "indicating", 

405 "modifying", 

406 "normally", 

407 "not", 

408 "of", 

409 "preceding", 

410 "prefixed", 

411 "referring", 

412 "relating", 

413 "revived", 

414 "said", 

415 "since", 

416 "takes", 

417 "used", 

418 "with", 

419 "With", 

420 "without", 

421 ] 

422) 

423# Allow the ignored unknown starts without complaining 

424allowed_unknown_starts.update(ignored_unknown_starts) 

425 

426# Full unknown tags that will be ignored in decode_tags() 

427# XXX this is unused, ask Tatu where the contents are now

428ignored_unknown_tags: set[str] = set([]) 

429 

430# Head endings that are mapped to tags 

431head_end_map = { 

432 " 1st conj.": "conjugation-1", 

433 " 2nd conj.": "conjugation-2", 

434 " 3rd conj.": "conjugation-3", 

435 " 4th conj.": "conjugation-4", 

436 " 5th conj.": "conjugation-5", 

437 " 6th conj.": "conjugation-6", 

438 " 7th conj.": "conjugation-7", 

439} 

440head_end_re = re.compile( 

441 r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$" 

442) 

443 

444 

445# Dictionary of language-specific parenthesized head part starts that 

446# either introduce new tags or modify previous tags. The value for each 

447# language is a dictionary that maps the first word of the head part to 

448# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous 

449# tags or a space-separated string of tags to remove, and ``add_tags`` should 

450# be a string of tags to add. 

451lang_specific_head_map: dict[ 

452 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]] 

453] = { 

454 "Danish": { 

455 # prefix: (rem_tags space separate string/True, add_tags s-sep str) 

456 "c": ("neuter", "common-gender"), 

457 "n": ("common-gender", "neuter"), 

458 "pl": ("singular neuter common-gender", "plural"), 

459 "sg": ("plural neuter common-gender", "singular"), 

460 }, 

461} 

462 

463 

464# Regular expression used to strip additional stuff from the end of alt_of and 

465# form_of. 

466alt_of_form_of_clean_re = re.compile( 

467 r"(?s)(" 

468 + "|".join( 

469 [ 

470 r":", 

471 r'[“"]', 

472 r";", 

473 r" \(", 

474 r" - ", 

475 r" ־ ", 

476 r" ᠆ ", 

477 r" ‐ ", 

478 r" ‑ ", 

479 r" ‒ ", 

480 r" – ", 

481 r" — ", 

482 r" ― ", 

483 r" − ", 

484 r" ⸺ ", 

485 r" ⸻ ", 

486 r" ﹘ ", 

487 r" ﹣ ", 

488 r" - ", 

489 r" \+ ", 

490 r" \(with ", 

491 r" with -ra/-re", 

492 r"\. Used ", 

493 r"\. Also ", 

494 r"\. Since ", 

495 r"\. A ", 

496 r"\.\. A ", 

497 r"\. An ", 

498 r"\.\. An ", 

499 r"\. an ", 

500 r"\. The ", 

501 r"\. Spanish ", 

502 r"\. Language ", 

503 r"\. former name of ", 

504 r"\. AIM", 

505 r"\. OT", 

506 r"\. Not ", 

507 r"\. Now ", 

508 r"\. Nowadays ", 

509 r"\. Early ", 

510 r"\. ASEAN", 

511 r"\. UN", 

512 r"\. IMF", 

513 r"\. WHO", 

514 r"\. WIPO", 

515 r"\. AC", 

516 r"\. DC", 

517 r"\. DNA", 

518 r"\. RNA", 

519 r"\. SOB", 

520 r"\. IMO", 

521 r"\. Behavior", 

522 r"\. Income ", 

523 r"\. More ", 

524 r"\. Most ", 

525 r"\. Only ", 

526 r"\. Also ", 

527 r"\. From ", 

528 r"\. Of ", 

529 r"\.\. Of ", 

530 r"\. To ", 

531 r"\. For ", 

532 r"\. If ", 

533 r"\. Praenominal ", 

534 r"\. This ", 

535 r"\. Replaced ", 

536 r"\. CHCS is the ", 

537 r"\. Equivalent ", 

538 r"\. Initialism ", 

539 r"\. Note ", 

540 r"\. Alternative ", 

541 r"\. Compare ", 

542 r"\. Cf\. ", 

543 r"\. Comparable ", 

544 r"\. Involves ", 

545 r"\. Sometimes ", 

546 r"\. Commonly ", 

547 r"\. Often ", 

548 r"\. Typically ", 

549 r"\. Possibly ", 

550 r"\. Although ", 

551 r"\. Rare ", 

552 r"\. Instead ", 

553 r"\. Integrated ", 

554 r"\. Distinguished ", 

555 r"\. Given ", 

556 r"\. Found ", 

557 r"\. Was ", 

558 r"\. In ", 

559 r"\. It ", 

560 r"\.\. It ", 

561 r"\. One ", 

562 r"\. Any ", 

563 r"\. They ", 

564 r"\. Members ", 

565 r"\. Each ", 

566 r"\. Original ", 

567 r"\. Especially ", 

568 r"\. Usually ", 

569 r"\. Known ", 

570 r"\.\. Known ", 

571 r"\. See ", 

572 r"\. see ", 

573 r"\. target was not ", 

574 r"\. Popular ", 

575 r"\. Pedantic ", 

576 r"\. Positive ", 

577 r"\. Society ", 

578 r"\. Plan ", 

579 r"\. Environmentally ", 

580 r"\. Affording ", 

581 r"\. Encompasses ", 

582 r"\. Expresses ", 

583 r"\. Indicates ", 

584 r"\. Text ", 

585 r"\. Large ", 

586 r"\. Sub-sorting ", 

587 r"\. Sax", 

588 r"\. First-person ", 

589 r"\. Second-person ", 

590 r"\. Third-person ", 

591 r"\. 1st ", 

592 r"\. 2nd ", 

593 r"\. 3rd ", 

594 r"\. Term ", 

595 r"\. Northeastern ", 

596 r"\. Northwestern ", 

597 r"\. Southeast ", 

598 r"\. Egyptian ", 

599 r"\. English ", 

600 r"\. Cape Province was split into ", 

601 r"\. Pañcat", 

602 r"\. of the ", 

603 r"\. is ", 

604 r"\. after ", 

605 r"\. or ", 

606 r"\. chromed", 

607 r"\. percussion", 

608 r"\. with his ", 

609 r"\. a\.k\.a\. ", 

610 r"\. comparative form ", 

611 r"\. singular ", 

612 r"\. plural ", 

613 r"\. present ", 

614 r"\. his ", 

615 r"\. her ", 

616 r"\. equivalent ", 

617 r"\. measuring ", 

618 r"\. used in ", 

619 r"\. cutely ", 

620 r"\. Protects", 

621 r'\. "', 

622 r"\.^", 

623 r"\. \+ ", 

624 r"\., ", 

625 r". — ", 

626 r", a ", 

627 r", an ", 

628 r", the ", 

629 r", obsolete ", 

630 r", possessed", # 'd/English 

631 r", imitating", # 1/English 

632 r", derived from", 

633 r", called ", 

634 r", especially ", 

635 r", slang for ", 

636 r" corresponding to ", 

637 r" equivalent to ", 

638 r" popularized by ", 

639 r" denoting ", 

640 r" in its various senses\.", 

641 r" used by ", 

642 r" but not for ", 

643 r" since ", 

644 r" i\.e\. ", 

645 r" i\. e\. ", 

646 r" e\.g\. ", 

647 r" eg\. ", 

648 r" etc\. ", 

649 r"\[http", 

650 r" — used as ", 

651 r" by K\. Forsyth ", 

652 r" by J\. R\. Allen ", 

653 r" by S\. Ferguson ", 

654 r" by G\. Donaldson ", 

655 r" May refer to ", 

656 r" An area or region ", 

657 ] 

658 ) 

659 + r").*$" 

660) 

661 

662 

663class ValidNode: 

664 """Node in the valid_sequences tree. Each node is part of a chain 

665 or chains that form sequences built out of keys in key->tags 

666 maps like xlat_tags, etc. The ValidNode's 'word' is the key 

667 by which it is referred to in the root dict or a `children` dict,

668 `end` marks that the node is the end-terminus of a sequence (but 

669 it can still continue if the sequence is shared by the start of 

670 other sequences: "nominative$" and "nominative plural$" for example), 

671 `tags` and `topics` are the dicts containing tag and topic strings 

672 for terminal nodes (end==True).""" 

673 

674 __slots__ = ( 

675 "end", 

676 "tags", 

677 "topics", 

678 "children", 

679 ) 

680 

681 def __init__( 

682 self, 

683 end=False, 

684 tags: Optional[list[str]] = None, 

685 topics: Optional[list[str]] = None, 

686 children: Optional[dict[str, "ValidNode"]] = None, 

687 ) -> None: 

688 self.end = end 

689 self.tags: list[str] = tags or [] 

690 self.topics: list[str] = topics or [] 

691 self.children: dict[str, "ValidNode"] = children or {} 

692 

693 

694def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None: 

695 """Helper function for building trees of valid tags/sequences during 

696 initialization.""" 

697 assert isinstance(tree, ValidNode) 

698 assert isinstance(desc, str) 

699 assert v is None or isinstance(v, str) 

700 node = tree 

701 

702 # Build the tree structure: each node has children nodes 

703 # whose names are denoted by their dict key. 

704 for w in desc.split(" "): 

705 if w in node.children: 

706 node = node.children[w] 

707 else: 

708 new_node = ValidNode() 

709 node.children[w] = new_node 

710 node = new_node 

711 if not node.end: 

712 node.end = True 

713 if not v: 

714 return None # Terminate early because there are no tags 

715 

716 tagslist = [] 

717 topicslist = [] 

718 for vv in v.split(): 

719 if vv in valid_tags: 

720 tagslist.append(vv) 

721 elif vv in valid_topics: 721 ↛ 724

722 topicslist.append(vv) 

723 else: 

724 print( 

725 "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv) 

726 ) 

727 topics = " ".join(topicslist) 

728 tags = " ".join(tagslist) 

729 # Changed to "_tags" and "_topics" to avoid possible key-collisions. 

730 if topics: 

731 node.topics.extend([topics]) 

732 if tags: 

733 node.tags.extend([tags]) 

734 

735 

736def add_to_valid_tree1( 

737 tree: ValidNode, 

738 k: str, 

739 v: Union[list[str], tuple[str, ...], str], 

740 valid_values: Union[set[str], dict[str, Any]], 

741) -> list[str]: 

742 assert isinstance(tree, ValidNode) 

743 assert isinstance(k, str) 

744 assert v is None or isinstance(v, (list, tuple, str)) 

745 assert isinstance(valid_values, (set, dict)) 

746 if not v: 746 ↛ 747

747 add_to_valid_tree(valid_sequences, k, None) 

748 return [] 

749 elif isinstance(v, str): 

750 v = [v] 

751 q = [] 

752 for vv in v: 

753 assert isinstance(vv, str) 

754 add_to_valid_tree(valid_sequences, k, vv) 

755 vvs = vv.split() 

756 for x in vvs: 

757 q.append(x) 

758 # return each individual tag 

759 return q 

760 

761 

762def add_to_valid_tree_mapping( 

763 tree: ValidNode, 

764 mapping: Union[dict[str, Union[list[str], str]], dict[str, str]], 

765 valid_values: Union[set[str], dict[str, Any]], 

766 recurse: bool, 

767) -> None: 

768 assert isinstance(tree, ValidNode) 

769 assert isinstance(mapping, dict) 

770 assert isinstance(valid_values, (set, dict)) 

771 assert recurse in (True, False) 

772 for k, v in mapping.items(): 

773 assert isinstance(k, str) 

774 assert isinstance(v, (list, str)) 

775 if isinstance(v, str): 

776 q = add_to_valid_tree1(tree, k, [v], valid_values) 

777 else: 

778 q = add_to_valid_tree1(tree, k, v, valid_values) 

779 if recurse: 

780 visited = set() 

781 while q: 

782 v = q.pop() 

783 if v in visited: 

784 continue 

785 visited.add(v) 

786 if v not in mapping: 

787 continue 

788 vv = mapping[v] 

789 qq = add_to_valid_tree1(tree, k, vv, valid_values) 

790 q.extend(qq) 

791 

792 

793# Tree of sequences considered to be tags (includes sequences that are 

794# mapped to something that becomes one or more valid tags) 

795valid_sequences = ValidNode() 

796sequences_with_slashes: set[str] = set() 

797for tag in valid_tags: 

798 # The basic tags used in our tag system; some are a bit weird, but it is

799 # easier to implement this with a few 'false' positives than to filter out

800 # stuff no one else uses.

801 if "/" in tag: 

802 sequences_with_slashes.add(tag) 

803 add_to_valid_tree(valid_sequences, tag, tag) 

804for tag in uppercase_tags: 

805 hyphenated = re.sub(r"\s+", "-", tag) 

806 if hyphenated in valid_tags: 806 ↛ 807

807 print( 

808 "DUPLICATE TAG: {} (from uppercase tag {!r})".format( 

809 hyphenated, tag 

810 ) 

811 ) 

812 assert hyphenated not in valid_tags 

813 # Might as well, while we're here: Add hyphenated location tag. 

814 valid_tags[hyphenated] = "dialect" 

815 add_to_valid_tree(valid_sequences, hyphenated, hyphenated) 

816for tag in uppercase_tags: 

817 hyphenated = re.sub(r"\s+", "-", tag) 

818 # XXX Move to above loop? Or is this here for readability? 

819 if "/" in tag: 

820 sequences_with_slashes.add(tag) 

821 add_to_valid_tree(valid_sequences, tag, hyphenated) 

822# xlat_tags_map! 

823add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False) 

824for k in xlat_tags_map: 

825 if "/" in k: 

826 sequences_with_slashes.add(k) 

827# Add topics to the same table, with all generalized topics also added 

828for topic in valid_topics: 

829 assert " " not in topic 

830 if "/" in topic: 830 ↛ 831

831 sequences_with_slashes.add(topic) 

832 add_to_valid_tree(valid_sequences, topic, topic) 

833# Let each original topic value stand alone. These are not generally on 

834# valid_topics. We add the original topics with spaces replaced by hyphens. 

835for topic in topic_generalize_map.keys(): 

836 hyphenated = topic.replace(" ", "-") 

837 valid_topics.add(hyphenated) 

838 if "/" in topic: 838 ↛ 839

839 sequences_with_slashes.add(topic)

840 add_to_valid_tree(valid_sequences, topic, hyphenated) 

841# Add canonicalized/generalized topic values 

842add_to_valid_tree_mapping( 

843 valid_sequences, topic_generalize_map, valid_topics, True 

844) 

845 

846# Regex used to divide a decode candidate into parts that shouldn't 

847# have their slashes turned into spaces 

848slashes_re = re.compile( 

849 r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")" 

850) 
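# Hedged example (added, not from the original source): if one of the collected
# keys were, say, "masculine/feminine", then re.split(slashes_re, ...) would
# keep that key intact as a captured (odd-indexed) part, so the retry logic in
# decode_tags() only turns the remaining slashes into spaces.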

851 

852# Regexp used to find "words" from word heads and linguistic descriptions 

853word_pattern = ( 

854 r"[^ ,;()\u200e]+|" 

855 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|" 

856 r"[\u2800-\u28ff]|" # Braille characters 

857 r"\(([^()]|\([^()]*\))*\)" 

858) 

859 

860word_re_global = re.compile(word_pattern) 

861 

862 

863def distw(titleparts: Sequence[str], word: str) -> float: 

864 """Computes how distinct ``word`` is from the most similar word in 

865 ``titleparts``. Returns 1 if words completely distinct, 0 if 

866 identical, or otherwise something in between.""" 

867 assert isinstance(titleparts, (list, tuple)) 

868 assert isinstance(word, str) 

869 w = min( 

870 Levenshtein.distance(word, tw) / max(len(tw), len(word)) 

871 for tw in titleparts 

872 ) 

873 return w 
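# Worked example (added, not from the original source):
# distw(["dog"], "dogs") == Levenshtein.distance("dogs", "dog") / 4 == 0.25,
# i.e. quite similar, whereas completely unrelated words score close to 1.0.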

874 

875 

876def map_with( 

877 ht: Union[dict[str, Union[str, list[str]]], dict[str, str]], 

878 lst: Sequence[str], 

879) -> list[str]: 

880 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or 

881 more alternatives each, and returns a combined list of alternatives.""" 

882 assert isinstance(ht, dict) 

883 assert isinstance(lst, (list, tuple)) 

884 ret = [] 

885 for x in lst: 

886 assert isinstance(x, str) 

887 x = x.strip() 

888 x = ht.get(x, x) 

889 if isinstance(x, str): 889 ↛ 892

890 if x: 890 ↛ 885

891 ret.append(x) 

892 elif isinstance(x, (list, tuple)): 

893 ret.extend(x) 

894 else: 

895 raise RuntimeError("map_with unexpected value: {!r}".format(x)) 

896 return ret 
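# Minimal usage sketch (added, not from the original source):
# map_with({"m": ["masculine"], "dated": ""}, ["m", "word", "dated"])
# returns ["masculine", "word"]: list values are spliced in, unmapped items
# pass through unchanged, and items mapped to an empty string are dropped.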

897 

898 

899TagList = list[str] 

900PosPathStep = tuple[int, TagList, TagList] 
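# (Clarifying note, not in the original: a "path" is a list of PosPathStep
# triples, each holding the word index where a matched or unknown segment
# starts plus its tag strings and topic strings; decode_tags1() below builds
# candidate paths over the whole description and keeps the lowest-weight one.)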

901 

902 

903def check_unknown( 

904 from_i: int, 

905 to_i: int, 

906 i: int, 

907 wordlst: Sequence[str], 

908 allow_any: bool, 

909 no_unknown_starts: bool, 

910) -> list[PosPathStep]: 

911 """Check if the current section from_i->to_i is actually unknown 

912 or if it needs some special handling. We already presupposed that 

913 this is UNKNOWN; this is just called to see what *kind* of UNKNOWN.""" 

914 assert isinstance(to_i, int) 

915 assert isinstance(from_i, int) 

916 assert isinstance(i, int) 

917 # Adds unknown tag if needed. Returns new last_i 

918 # print("check_unknown to_i={} from_i={} i={}" 

919 # .format(to_i, from_i, i)) 

920 if from_i >= to_i: 

921 return [] 

922 words = wordlst[from_i:to_i] 

923 tag = " ".join(words) 

924 assert tag 

925 # print(f"{tag=}") 

926 if re.match(ignored_unknown_starts_re, tag): 

927 # Tags with this start are to be ignored 

928 return [(from_i, ["UNKNOWN"], [])] 

929 if tag in ignored_unknown_tags: 929 ↛ 930

930 return [] # One of the tags listed as to be ignored 

931 if tag in ("and", "or"): 

932 return [] 

933 if ( 

934 not allow_any 

935 and not words[0].startswith("~") 

936 and ( 

937 no_unknown_starts 

938 or words[0] not in allowed_unknown_starts 

939 or len(words) <= 1 

940 ) 

941 ): 

942 # print("ERR allow_any={} words={}" 

943 # .format(allow_any, words)) 

944 return [ 

945 (from_i, ["UNKNOWN"], ["error-unknown-tag"]) 

946 ] # Add ``tag`` here to include 

947 else: 

948 return [(from_i, ["UNKNOWN"], [tag])] 

949 

950 

951def add_new1( 

952 node: ValidNode, 

953 i: int, 

954 start_i: int, 

955 last_i: int, 

956 new_paths: list[list[PosPathStep]], 

957 new_nodes: list[tuple[ValidNode, int, int]], 

958 pos_paths: list[list[list[PosPathStep]]], 

959 wordlst: list[str], 

960 allow_any: bool, 

961 no_unknown_starts: bool, 

962 max_last_i: int, 

963) -> int: 

964 assert isinstance(new_paths, list) 

965 # print("add_new: start_i={} last_i={}".format(start_i, last_i)) 

966 # print("$ {} last_i={} start_i={}" 

967 # .format(w, last_i, start_i)) 

968 max_last_i = max(max_last_i, last_i) # if last_i has grown 

969 if (node, start_i, last_i) not in new_nodes: 

970 new_nodes.append((node, start_i, last_i)) 

971 if node.end: 

972 # We can see a terminal point in the search tree. 

973 u = check_unknown( 

974 last_i, start_i, i, wordlst, allow_any, no_unknown_starts 

975 ) 

976 # Create new paths candidates based on different past possible 

977 # paths; pos_path[last_i] contains possible paths, so add this 

978 # new one at the beginning(?) 

979 # The list comprehension inside the parens generates an iterable 

980 # of lists, so this is .extend( [(last_i...)], [(last_i...)], ... ) 

981 # XXX: this is becoming impossible to annotate, nodes might 

982 # need to become classed objects and not just dicts, or at least 

983 # a TypedDict with a "children" node 

984 new_paths.extend( 

985 [(last_i, node.tags, node.topics)] + u + x 

986 for x in pos_paths[last_i] 

987 ) 

988 max_last_i = i + 1 

989 return max_last_i 

990 

991 

992@functools.lru_cache(maxsize=65536) 

993def decode_tags( 

994 src: str, 

995 allow_any=False, 

996 no_unknown_starts=False, 

997) -> tuple[list[tuple[str, ...]], list[str]]: 

998 tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts) 

999 # print(f"decode_tags: {src=}, {tagsets=}") 

1000 

1001 # Insert retry-code here that modifies the text source 

1002 if ( 

1003 any(s.startswith("error-") for tagset in tagsets for s in tagset) 

1004 # I hate Python's *nested* list comprehension syntax ^ 

1005 or any(s.startswith("error-") for s in topics) 

1006 ): 

1007 new_tagsets: list[tuple[str, ...]] = [] 

1008 new_topics: list[str] = [] 

1009 

1010 if "/" in src: 

1011 # slashes_re contains valid key entries with slashes; we're going 

1012 # to skip them by splitting the string and skipping handling every 

1013 # second entry, which contains the splitting group like "masculine/ 

1014 # feminine" style keys. 

1015 split_parts = re.split(slashes_re, src) 

1016 new_parts: list[str] = [] 

1017 if len(split_parts) > 1: 

1018 for i, s in enumerate(split_parts): 

1019 if i % 2 == 0: 

1020 new_parts.append(s.replace("/", " ")) 

1021 else: 

1022 new_parts.append(s) 

1023 new_src = "".join(new_parts) 

1024 else: 

1025 new_src = src 

1026 new_tagsets, new_topics = decode_tags1( 

1027 new_src, allow_any, no_unknown_starts 

1028 ) 

1029 elif " or " in src or " and " in src: 

1030 # Annoying kludge. 

1031 new_src = src.replace(" and ", " ") 

1032 new_src = new_src.replace(" or ", " ") 

1033 new_tagsets, new_topics = decode_tags1( 

1034 new_src, allow_any, no_unknown_starts 

1035 ) 

1036 # print(f"{new_tagsets=}") 

1037 

1038 if new_tagsets or new_topics: 

1039 old_errors = sum( 

1040 1 for tagset in tagsets for s in tagset if s.startswith("error") 

1041 ) 

1042 old_errors += sum(1 for s in topics if s.startswith("error")) 

1043 new_errors = sum( 

1044 1 

1045 for new_tagset in new_tagsets 

1046 for s in new_tagset 

1047 if s.startswith("error") 

1048 ) 

1049 new_errors += sum(1 for s in new_topics if s.startswith("error")) 

1050 

1051 if new_errors <= old_errors: 1051 ↛ 1054

1052 return new_tagsets, new_topics 

1053 

1054 return tagsets, topics 
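# Rough usage sketch (added, not from the original source):
# decode_tags("masculine plural") should yield something like
# ([("masculine", "plural")], []), while a completely unrecognized description
# produces a tagset containing "error-unknown-tag" unless allow_any=True.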

1055 

1056 

1057def decode_tags1( 

1058 src: str, 

1059 allow_any=False, 

1060 no_unknown_starts=False, 

1061) -> tuple[list[tuple[str, ...]], list[str]]: 

1062 """Decodes tags, doing some canonicalizations. This returns a list of 

1063 lists of tags and a list of topics.""" 

1064 assert isinstance(src, str) 

1065 

1066 # print("decode_tags: src={!r}".format(src)) 

1067 

1068 pos_paths: list[list[list[PosPathStep]]] = [[[]]] 

1069 wordlst: list[str] = [] 

1070 max_last_i = 0 # pre-initialized here so that it can be used as a ref 

1071 

1072 add_new = functools.partial( 

1073 add_new1, # pre-set parameters and references for function 

1074 pos_paths=pos_paths, 

1075 wordlst=wordlst, 

1076 allow_any=allow_any, 

1077 no_unknown_starts=no_unknown_starts, 

1078 max_last_i=max_last_i, 

1079 ) 

1080 # First split the tags at commas and semicolons. Their significance is that 

1081 # a multi-word sequence cannot continue across them. 

1082 parts = split_at_comma_semi(src, extra=[";", ":"]) 

1083 

1084 for part in parts: 

1085 max_last_i = len(wordlst) # "how far have we gone?" 

1086 lst1 = part.split() 

1087 if not lst1: 

1088 continue 

1089 wordlst.extend(lst1) 

1090 cur_nodes: list[tuple[ValidNode, int, int]] = [] # Currently seen 

1091 for w in lst1: 

1092 i = len(pos_paths) - 1 

1093 new_nodes: list[tuple[ValidNode, int, int]] = [] 

1094 # replacement nodes for next loop 

1095 new_paths: list[list[PosPathStep]] = [] 

1096 # print("ITER i={} w={} max_last_i={} wordlst={}" 

1097 # .format(i, w, max_last_i, wordlst)) 

1098 node: ValidNode 

1099 start_i: int 

1100 last_i: int 

1101 for node, start_i, last_i in cur_nodes: 

1102 # ValidNodes are part of a search tree that checks if a 

1103 # phrase is found in xlat_tags_map and other text->tags dicts. 

1104 if w in node.children: 

1105 # the phrase continues down the tree 

1106 # print("INC", w) 

1107 max_last_i = add_new( 

1108 node.children[w], 

1109 i, 

1110 start_i, 

1111 last_i, 

1112 new_paths, 

1113 new_nodes, 

1114 ) 

1115 if node.end: 

1116 # we've hit an end point, the tags and topics have already 

1117 # been gathered at some point, don't do anything with the 

1118 # old stuff 

1119 if w in valid_sequences.children: 

1120 # This starts a *new* possible section 

1121 max_last_i = add_new( 

1122 valid_sequences.children[w], # root-> 

1123 i, 

1124 i, 

1125 i, 

1126 new_paths, 

1127 new_nodes, 

1128 ) 

1129 if w not in node.children and not node.end: 

1130 # print("w not in node and $: i={} last_i={} wordlst={}" 

1131 # .format(i, last_i, wordlst)) 

1132 # If i == last_i == 0, for example (beginning) 

1133 if ( 

1134 i == last_i 

1135 or no_unknown_starts 

1136 or wordlst[last_i] not in allowed_unknown_starts 

1137 ): 

1138 # print("NEW", w) 

1139 if w in valid_sequences.children: 

1140 # Start new sequences here 

1141 max_last_i = add_new( 

1142 valid_sequences.children[w], 

1143 i, 

1144 i, 

1145 last_i, 

1146 new_paths, 

1147 new_nodes, 

1148 ) 

1149 if not new_nodes: 

1150 # This is run at the start when i == max_last_i == 0, 

1151 # which is what populates the first node in new_nodes. 

1152 # Some initial words cause the rest to be interpreted as unknown 

1153 # print("not new nodes: i={} last_i={} wordlst={}" 

1154 # .format(i, max_last_i, wordlst)) 

1155 if ( 

1156 i == max_last_i 

1157 or no_unknown_starts 

1158 or wordlst[max_last_i] not in allowed_unknown_starts 

1159 ): 

1160 # print("RECOVER w={} i={} max_last_i={} wordlst={}" 

1161 # .format(w, i, max_last_i, wordlst)) 

1162 if w in valid_sequences.children: 

1163 max_last_i = add_new( 

1164 # new sequence from root 

1165 valid_sequences.children[w], 

1166 i, 

1167 i, 

1168 max_last_i, 

1169 new_paths, 

1170 new_nodes, 

1171 ) 

1172 cur_nodes = new_nodes # Completely replace nodes! 

1173 # 2023-08-18, fix to improve performance 

1174 # Decode tags does a big search of the best-shortest matching 

1175 # sequences of tags, but the original algorithm didn't have 

1176 # any culling happen during operation, so in a case with 

1177 # a lot of tags (for example, big blocks of text inserted 

1178 # somewhere by mistake that is processed by decode_tags), 

1179 # it would lead to exponential growth of new_paths contents. 

1180 # This culling, using the same weighting algorithm code as 

1181 # in the original is just applied to new_paths before it is 

1182 # added to pos_paths. Basically it's "take the 10 best paths". 

1183 # This *can* cause bugs if it gets stuck in a local minimum 

1184 # or something, but this whole process is one-dimensional 

1185 # and not that complex, so hopefully it works out... 

1186 pw = [] 

1187 path: list[PosPathStep] 

1188 for path in new_paths: 

1189 weight = len(path) 

1190 if any(x[1] == ["UNKNOWN"] for x in path): 

1191 weight += 100 # Penalize unknown paths 

1192 pw.append((weight, path)) 

1193 new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]] 

1194 pos_paths.append(new_paths) 

1195 

1196 # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}" 

1197 # .format(max_last_i, len(wordlst), len(pos_paths))) 

1198 

1199 if cur_nodes: 

1200 # print("END HAVE_NODES") 

1201 for node, start_i, last_i in cur_nodes: 

1202 if node.end: 

1203 # print("$ END start_i={} last_i={}" 

1204 # .format(start_i, last_i)) 

1205 for path in pos_paths[start_i]: 

1206 pos_paths[-1].append( 

1207 [(last_i, node.tags, node.topics)] + path 

1208 ) 

1209 else: 

1210 # print("UNK END start_i={} last_i={} wordlst={}" 

1211 # .format(start_i, last_i, wordlst)) 

1212 u = check_unknown( 

1213 last_i, 

1214 len(wordlst), 

1215 len(wordlst), 

1216 wordlst, 

1217 allow_any, 

1218 no_unknown_starts, 

1219 ) 

1220 if pos_paths[start_i]: 

1221 for path in pos_paths[start_i]: 

1222 pos_paths[-1].append(u + path) 

1223 else: 

1224 pos_paths[-1].append(u) 

1225 else: 

1226 # Check for a final unknown tag 

1227 # print("NO END NODES max_last_i={}".format(max_last_i)) 

1228 paths = pos_paths[max_last_i] or [[]] 

1229 u = check_unknown( 

1230 max_last_i, 

1231 len(wordlst), 

1232 len(wordlst), 

1233 wordlst, 

1234 allow_any, 

1235 no_unknown_starts, 

1236 ) 

1237 if u: 1237 ↛ 1084

1238 # print("end max_last_i={}".format(max_last_i)) 

1239 for path in list(paths): # Copy in case it is the last pos 

1240 pos_paths[-1].append(u + path) 

1241 

1242 # import json 

1243 # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True)) 

1244 

1245 if not pos_paths[-1]: 1245 ↛ 1247

1246 # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src)) 

1247 return [], [] 

1248 

1249 # Find the best path 

1250 pw = [] 

1251 for path in pos_paths[-1]: 

1252 weight = len(path) 

1253 if any(x[1] == ["UNKNOWN"] for x in path): 

1254 weight += 100 # Penalize unknown paths 

1255 pw.append((weight, path)) 

1256 path = min(pw)[1] 

1257 

1258 # Convert the best path to tagsets and topics 

1259 tagsets: list[list[str]] = [[]] 

1260 topics: list[str] = [] 

1261 for i, tagspec, topicspec in path: 

1262 if len(tagsets or "") > 16: 1262 ↛ 1265

1263 # ctx.error("Too many tagsets! This is probably exponential", 

1264 # sortid="form_descriptions/20230818") 

1265 return [("error-unknown-tag", "error-exponential-tagsets")], [] 

1266 if tagspec == ["UNKNOWN"]: 

1267 new_tagsets = [] 

1268 for x in tagsets: 

1269 new_tagsets.append(x + topicspec) 

1270 tagsets = new_tagsets 

1271 continue 

1272 if tagspec: 

1273 new_tagsets = [] 

1274 for x in tagsets: 

1275 for t in tagspec: 

1276 if t: 1276 ↛ 1283

1277 new_tags = list(x) 

1278 for tag in t.split(): 

1279 if tag not in new_tags: 

1280 new_tags.append(tag) 

1281 new_tagsets.append(new_tags) 

1282 else: 

1283 new_tagsets.append(x) 

1284 tagsets = new_tagsets 

1285 if topicspec: 

1286 for t in topicspec: 

1287 for topic in t.split(): 

1288 if topic not in topics: 

1289 topics.append(topic) 

1290 

1291 # print("unsorted tagsets:", tagsets) 

1292 ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets)) 

1293 # topics = list(sorted(set(topics))) XXX tests expect not sorted 

1294 # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics)) 

1295 # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST 

1296 # of tags. Turning topics into a tuple breaks tests, turning the tuples 

1297 # inside tagsets into lists breaks tests, I'm leaving them mismatched 

1298 # for now. XXX 

1299 return ret_tagsets, topics 

1300 

1301 

1302def parse_head_final_tags( 

1303 wxr: WiktextractContext, lang: str, form: str 

1304) -> tuple[str, list[str]]: 

1305 """Parses tags that are allowed at the end of a form head from the end 

1306 of the form. This can also be used for parsing the final gender etc tags 

1307 from translations and linkages.""" 

1308 assert isinstance(wxr, WiktextractContext) 

1309 assert isinstance(lang, str) # Should be language that "form" is for 

1310 assert isinstance(form, str) 

1311 

1312 # print("parse_head_final_tags: lang={} form={!r}".format(lang, form)) 

1313 

1314 # Make sure there are no double spaces in the form as this code does not 

1315 # handle them otherwise. 

1316 form = re.sub(r"\s+", " ", form.strip()) 

1317 if not form: 

1318 return form, [] 

1319 

1320 origform = form 

1321 

1322 tags = [] 

1323 

1324 # If parsing for certain Bantu languages (e.g., Swahili), handle 

1325 # some extra head-final tags first 

1326 if lang in head_final_bantu_langs: 

1327 m = re.search(head_final_bantu_re, form) 

1328 if m is not None: 

1329 tagkeys = m.group(1) 

1330 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr] 1330 ↛ 1345

1331 form = form[: m.start()] 

1332 v = head_final_bantu_map[tagkeys] 

1333 if v.startswith("?"): 1333 ↛ 1334

1334 v = v[1:] 

1335 wxr.wtp.debug( 

1336 "suspicious suffix {!r} in language {}: {}".format( 

1337 tagkeys, lang, origform 

1338 ), 

1339 sortid="form_descriptions/1028", 

1340 ) 

1341 tags.extend(v.split()) 

1342 

1343 # If parsing for certain Semitic languages (e.g., Arabic), handle 

1344 # some extra head-final tags first 

1345 if lang in head_final_semitic_langs: 

1346 m = re.search(head_final_semitic_re, form) 

1347 if m is not None: 

1348 tagkeys = m.group(1) 

1349 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr] 1349 ↛ 1364

1350 form = form[: m.start()] 

1351 v = head_final_semitic_map[tagkeys] 

1352 if v.startswith("?"): 1352 ↛ 1353

1353 v = v[1:] 

1354 wxr.wtp.debug( 

1355 "suspicious suffix {!r} in language {}: {}".format( 

1356 tagkeys, lang, origform 

1357 ), 

1358 sortid="form_descriptions/1043", 

1359 ) 

1360 tags.extend(v.split()) 

1361 

1362 # If parsing for certain other languages (e.g., Lithuanian, 

1363 # French, Finnish), handle some extra head-final tags first 

1364 if lang in head_final_other_langs: 

1365 m = re.search(head_final_other_re, form) 

1366 if m is not None: 

1367 tagkeys = m.group(1) 

1368 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr] 1368 ↛ 1373

1369 form = form[: m.start()] 

1370 tags.extend(head_final_other_map[tagkeys].split(" ")) 

1371 

1372 # Handle normal head-final tags 

1373 m = re.search(head_final_re, form) 

1374 if m is not None: 

1375 tagkeys = m.group(3) 

1376 # Only replace tags ending with numbers in languages that have 

1377 # head-final numeric tags (e.g., Bantu classes); also, don't replace 

1378 # tags if the main title ends with them (then presume they are part 

1379 # of the word) 

1380 # print("head_final_tags form={!r} tagkeys={!r} lang={}" 

1381 # .format(form, tagkeys, lang)) 

1382 tagkeys_contains_digit = re.search(r"\d", tagkeys) 

1383 if ( 

1384 (not tagkeys_contains_digit or lang in head_final_numeric_langs) 

1385 and not wxr.wtp.title.endswith(" " + tagkeys) # type:ignore[union-attr] 

1386 and 

1387 # XXX the above test does not capture when the whole word is a 

1388 # xlat_head_map key, so I added the below test to complement 

1389 # it; does this break anything? 

1390 not wxr.wtp.title == tagkeys 

1391 ): # defunct/English, 

1392 # "more defunct" -> "more" ["archaic"] 

1393 if not tagkeys_contains_digit or lang in head_final_numeric_langs: 1393 ↛ 1407

1394 form = form[: m.start()] 

1395 v = xlat_head_map[tagkeys] 

1396 if v.startswith("?"): 1396 ↛ 1397

1397 v = v[1:] 

1398 wxr.wtp.debug( 

1399 "suspicious suffix {!r} in language {}: {}".format( 

1400 tagkeys, lang, origform 

1401 ), 

1402 sortid="form_descriptions/1077", 

1403 ) 

1404 tags.extend(v.split()) 

1405 

1406 # Generate warnings about words ending in " or" after processing 

1407 if ( 

1408 (form.endswith(" or") and not origform.endswith(" or")) 

1409 or re.search( 

1410 r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|" 

1411 r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)" 

1412 r"($|/| (f|m|sg|pl|anim|inan))", 

1413 form, 

1414 ) 

1415 or form.endswith(" du") 

1416 ): 

1417 if form not in ok_suspicious_forms: 

1418 wxr.wtp.debug( 

1419 "suspicious unhandled suffix in {}: {!r}, originally {!r}".format( 

1420 lang, form, origform 

1421 ), 

1422 sortid="form_descriptions/1089", 

1423 ) 

1424 

1425 # print("parse_head_final_tags: form={!r} tags={}".format(form, tags)) 

1426 return form, tags 
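# Hedged example (added, not from the original source): for a French linkage
# such as "chienne f", and assuming "f" is an xlat_head_map key mapping to
# "feminine", this returns ("chienne", ["feminine"]); the suffix is only
# stripped when the page title itself does not end with it.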

1427 

1428 

1429def quote_kept_parens(s: str) -> str: 

1430 """Changes certain parenthesized expressions so that they won't be 

1431 interpreted as parentheses. This is used for parts that are kept as 

1432 part of the word, such as "read admiral (upper half)".""" 

1433 return re.sub( 

1434 r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|" 

1435 r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)", 

1436 r"__lpar__\1__rpar__", 

1437 s, 

1438 ) 
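# Illustrative example (added, not from the original source):
# quote_kept_parens("rear admiral (upper half)") returns
# "rear admiral __lpar__upper half__rpar__"; unquote_kept_parens() later
# restores the original parentheses.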

1439 

1440 

1441def quote_kept_ruby( 

1442 wxr: WiktextractContext, 

1443 ruby_tuples: list[ 

1444 tuple[ 

1445 str, 

1446 str, 

1447 ] 

1448 ], 

1449 s: str, 

1450) -> str: 

1451 if len(ruby_tuples) < 1: 

1452 wxr.wtp.debug( 

1453 "quote_kept_ruby called with no ruby", 

1454 sortid="form_description/1114/20230517", 

1455 ) 

1456 return s 

1457 ks = [] 

1458 rs = [] 

1459 for k, r in ruby_tuples: 

1460 ks.append(re.escape(k)) 

1461 rs.append(re.escape(r)) 

1462 if not (ks and rs): 

1463 wxr.wtp.debug( 

1464 f"empty column in ruby_tuples: {ruby_tuples}", 

1465 sortid="form_description/1124/20230606", 

1466 ) 

1467 return s 

1468 newm = re.compile( 

1469 r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs)) 

1470 ) 

1471 rub_re = re.compile( 

1472 r"({})".format( 

1473 r"|".join( 

1474 r"{}\(*{}\)*".format( 

1475 re.escape(k), 

1476 re.escape(r), 

1477 ) 

1478 for k, r in ruby_tuples 

1479 ) 

1480 ) 

1481 ) 

1482 

1483 def paren_replace(m: re.Match) -> str: 

1484 return re.sub(newm, r"\1__lrub__\2__rrub__", m.group(0)) 

1485 

1486 return re.sub(rub_re, paren_replace, s) 

1487 

1488 

1489def unquote_kept_parens(s: str) -> str: 

1490 """Conerts the quoted parentheses back to normal parentheses.""" 

1491 return re.sub(r"__lpar__(.*?)__rpar__", r"(\1)", s) 

1492 

1493 

1494def add_romanization( 

1495 wxr: WiktextractContext, 

1496 data: WordData, 

1497 roman: str, 

1498 text: str, 

1499 is_reconstruction: bool, 

1500 head_group: Optional[int], 

1501 ruby: Sequence[tuple[str, str]], 

1502) -> None: 

1503 tags_lst = ["romanization"] 

1504 m = re.match(r"([^:]+):(.+)", roman) 

1505 # This function's purpose is to intercept broken romanizations, 

1506 # like "Yale: hēnpyeng" style tags. Most romanization styles 

1507 # are already present as tags, so we can use decode_tags to find 

1508 # them. 

1509 if m: 1509 ↛ 1510

1510 tagsets, topics = decode_tags(m.group(1)) 

1511 if tagsets: 

1512 for tags in tagsets: 

1513 tags_lst.extend(tags) 

1514 roman = m.group(2) 

1515 add_related( 

1516 wxr, 

1517 data, 

1518 tags_lst, 

1519 [roman], 

1520 text, 

1521 True, 

1522 is_reconstruction, 

1523 head_group, 

1524 ruby, 

1525 ) 

1526 

1527 

1528def add_related( 

1529 wxr: WiktextractContext, 

1530 data: WordData, 

1531 tags_lst: Union[list[str], tuple[str, ...]], 

1532 related_list: list[str], 

1533 origtext: str, 

1534 add_all_canonicals: bool, 

1535 is_reconstruction: bool, 

1536 head_group: Optional[int], 

1537 ruby_data: Optional[Sequence[tuple[str, str]]] = None, 

1538) -> Optional[list[tuple[str, ...]]]: 

1539 """Internal helper function for some post-processing entries for related 

1540 forms (e.g., in word head). This returns a list of list of tags to be 

1541 added to following related forms or None (cf. walrus/English word head, 

1542 parenthesized part starting with "both").""" 

1543 assert isinstance(wxr, WiktextractContext) 

1544 assert isinstance(tags_lst, (list, tuple)) 

1545 for x in tags_lst: 

1546 assert isinstance(x, str) 

1547 assert isinstance(related_list, (list, tuple)) 

1548 assert isinstance(origtext, str) 

1549 assert add_all_canonicals in (True, False) 

1550 assert isinstance(ruby_data, (list, tuple)) or ruby_data is None 

1551 if ruby_data is None: 1551 ↛ 1552

1552 ruby_data = [] 

1553 # print("add_related: tags_lst={} related={}".format(tags_lst, related)) 

1554 related = " ".join(related_list) 

1555 if related == "[please provide]": 1555 ↛ 1556

1556 return None 

1557 if related in IGNORED_RELATED: 1557 ↛ 1558

1558 return None 

1559 if is_reconstruction and related.startswith("*") and len(related) > 1: 

1560 related = related[1:] 

1561 

1562 # Get title word, with any reconstruction prefix removed 

1563 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title) # type:ignore[arg-type] 

1564 

1565 def check_related(related: str) -> None: 

1566 # Warn about some suspicious related forms 

1567 m = re.search(suspicious_related_re, related) 

1568 if (m and m.group(0) not in titleword) or ( 

1569 related in ("f", "m", "n", "c") and len(titleword) >= 3 

1570 ): 

1571 if "eumhun" in tags_lst: 1571 ↛ 1572line 1571 didn't jump to line 1572 because the condition on line 1571 was never true

1572 return 

1573 if "cangjie-input" in tags_lst: 1573 ↛ 1574line 1573 didn't jump to line 1574 because the condition on line 1573 was never true

1574 return 

1575 if "class" in tags_lst: 1575 ↛ 1576line 1575 didn't jump to line 1576 because the condition on line 1575 was never true

1576 return 

1577 if wxr.wtp.section == "Korean" and re.search( 1577 ↛ 1581

1578 r"^\s*\w*>\w*\s*$", related 

1579 ): 

1580 # ignore Korean "i>ni" / "라>나" values 

1581 return 

1582 if ( 1582 ↛ 1589

1583 wxr.wtp.section == "Burmese" 

1584 and "romanization" in tags_lst 

1585 and re.search(r":", related) 

1586 ): 

1587 # ignore Burmese with ":", that is used in Burmese 

1588 # transliteration of "း", the high-tone visarga.

1589 return 

1590 wxr.wtp.debug( 

1591 "suspicious related form tags {}: {!r} in {!r}".format( 

1592 tags_lst, related, origtext 

1593 ), 

1594 sortid="form_descriptions/1147", 

1595 ) 

1596 

1597 following_tagsets = None # Tagsets to add to following related forms 

1598 roman = None 

1599 tagsets1: list[tuple[str, ...]] = [tuple()] 

1600 topics1: list[str] = [] 

1601 

1602 m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related) 

1603 if m: 

1604 paren = m.group(1) 

1605 related = related[m.end() :] 

1606 m = re.match(r"^(all|both) (.*)", paren) 

1607 if m: 1607 ↛ 1608

1608 tagsets1, topics1 = decode_tags(m.group(2)) 

1609 following_tagsets = tagsets1 

1610 else: 

1611 tagsets1, topics1 = decode_tags(paren) 

1612 else: 

1613 m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related) 

1614 if m: 

1615 paren = m.group(1) 

1616 if paren.startswith("U+"): 1616 ↛ 1617

1617 related = related[: m.start()] 

1618 else: 

1619 cls = classify_desc(paren) 

1620 if ( 1620 ↛ 1627

1621 cls in ("romanization", "english") 

1622 and classify_desc(related[: m.start()]) == "other" 

1623 ): 

1624 roman = paren 

1625 related = related[: m.start()] 

1626 else: 

1627 related = related[: m.start()] 

1628 tagsets1, topics1 = decode_tags(paren) 

1629 if related and related.startswith("{{"): 1629 ↛ 1630

1630 wxr.wtp.debug( 

1631 "{{ in word head form - possible Wiktionary error: {!r}".format( 

1632 related 

1633 ), 

1634 sortid="form_descriptions/1177", 

1635 ) 

1636 return None # Likely Wiktionary coding error 

1637 related = unquote_kept_parens(related) 

1638 # Split related by "/" (e.g., grande/Spanish) superlative in head 

1639 # Do not split if / in word title, see π//Japanese 

1640 if len(related) > 5 and "/" not in wxr.wtp.title: # type:ignore[operator] 

1641 alts = split_at_comma_semi(related, separators=["/"]) 

1642 else: 

1643 alts = [related] 

1644 if ruby_data: 1644 ↛ 1646

1645 # prepare some regex stuff in advance 

1646 ks, rs = [], [] 

1647 for k, r in ruby_data: 

1648 ks.append(re.escape(k)) 

1649 rs.append(re.escape(r)) 

1650 splitter = r"((?:{})__lrub__(?:{})__rrub__)".format( 

1651 "|".join(ks), "|".join(rs) 

1652 ) 

1653 for related in alts: 

1654 ruby: list[tuple[str, str]] = [] 

1655 if ruby_data: 1655 ↛ 1656

1656 new_related = [] 

1657 rub_split = re.split(splitter, related) 

1658 for s in rub_split: 

1659 m = re.match(r"(.+)__lrub__(.+)__rrub__", s) 

1660 if m: 

1661 # add ruby with (\1, \2) 

1662 ruby.append((m.group(1), m.group(2))) 

1663 new_related.append(m.group(1)) 

1664 else: 

1665 new_related.append(s) 

1666 related = "".join(new_related) 

1667 tagsets2, topics2 = decode_tags(" ".join(tags_lst)) 

1668 for tags1 in tagsets1: 

1669 assert isinstance(tags1, (list, tuple)) 

1670 for tags2 in tagsets2: 

1671 assert isinstance(tags1, (list, tuple)) 

1672 dt: LinkageData = {"word": related} 

1673 if roman: 

1674 dt["roman"] = roman 

1675 if ruby: 1675 ↛ 1676

1676 dt["ruby"] = ruby 

1677 if "alt-of" in tags2: 1677 ↛ 1678line 1677 didn't jump to line 1678 because the condition on line 1677 was never true

1678 check_related(related) 

1679 data_extend(data, "tags", tags1) 

1680 data_extend(data, "tags", tags2) 

1681 data_extend(data, "topics", topics1) 

1682 data_extend(data, "topics", topics2) 

1683 data_append(data, "alt_of", dt) 

1684 elif "form-of" in tags2: 1684 ↛ 1685line 1684 didn't jump to line 1685 because the condition on line 1684 was never true

1685 check_related(related) 

1686 data_extend(data, "tags", tags1) 

1687 data_extend(data, "tags", tags2) 

1688 data_extend(data, "topics", topics1) 

1689 data_extend(data, "topics", topics2) 

1690 data_append(data, "form_of", dt) 

1691 elif "compound-of" in tags2: 1691 ↛ 1692line 1691 didn't jump to line 1692 because the condition on line 1691 was never true

1692 check_related(related) 

1693 data_extend(data, "tags", tags1) 

1694 data_extend(data, "tags", tags2) 

1695 data_extend(data, "topics", topics1) 

1696 data_extend(data, "topics", topics2) 

1697 data_append(data, "compound", related) 

1698 else: 

1699 lang = wxr.wtp.section or "LANG_MISSING" 

1700 related, final_tags = parse_head_final_tags( 

1701 wxr, lang, related 

1702 ) 

1703 # print("add_related: related={!r} tags1={!r} tags2={!r} " 

1704 # "final_tags={!r}" 

1705 # .format(related, tags1, tags2, final_tags)) 

1706 tags = list(tags1) + list(tags2) + list(final_tags) 

1707 check_related(related) 

1708 form: FormData = {"form": related} 

1709 if head_group: 

1710 form["head_nr"] = head_group 

1711 if roman: 

1712 form["roman"] = roman 

1713 if ruby: 1713 ↛ 1714line 1713 didn't jump to line 1714 because the condition on line 1713 was never true

1714 form["ruby"] = ruby 

1715 data_extend(form, "topics", topics1) 

1716 data_extend(form, "topics", topics2) 

1717 if topics1 or topics2: 1717 ↛ 1718line 1717 didn't jump to line 1718 because the condition on line 1717 was never true

1718 wxr.wtp.debug( 

1719 "word head form has topics: {}".format(form), 

1720 sortid="form_descriptions/1233", 

1721 ) 

1722 # Add tags from canonical form into the main entry 

1723 if "canonical" in tags: 

1724 if related in ("m", "f") and len(titleword) > 1: 1724 ↛ 1725line 1724 didn't jump to line 1725 because the condition on line 1724 was never true

1725 wxr.wtp.debug( 

1726 "probably incorrect canonical form " 

1727 "{!r} ignored (probably tag combination " 

1728 "missing from xlat_head_map)".format(related), 

1729 sortid="form_descriptions/1241", 

1730 ) 

1731 continue 

1732 if ( 

1733 related != titleword 

1734 or add_all_canonicals 

1735 or topics1 

1736 or topics2 

1737 or ruby 

1738 ): 

1739 data_extend(form, "tags", list(sorted(set(tags)))) 

1740 else: 

1741 # We won't add canonical form here 

1742 filtered_tags = list( 

1743 x for x in tags if x != "canonical" 

1744 ) 

1745 data_extend(data, "tags", filtered_tags) 

1746 continue 

1747 else: 

1748 data_extend(form, "tags", list(sorted(set(tags)))) 

1749 # Only insert if the form is not already there 

1750 for old in data.get("forms", ()): 

1751 if form == old: 1751 ↛ 1752line 1751 didn't jump to line 1752 because the condition on line 1751 was never true

1752 break 

1753 else: 

1754 data_append(data, "forms", form) 

1755 

1756 # If this form had pre-tags that started with "both" or "all", add those 

1757 # tags also to following related forms that don't have their own tags 

1758 # specified. 

1759 return following_tagsets 

1760 

1761 
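To make the "(both ...)"/"(all ...)" propagation mentioned in the comment above concrete, the following standalone sketch (hypothetical helper name and invented sample forms, not part of this module) shows the intended pattern: tags captured from a leading "(both ...)" parenthesis are carried forward to later related forms that have no tags of their own.

import re
from typing import Optional

def propagate_both_tags(items: list[str]) -> list[tuple[str, list[str]]]:
    """Toy model of the following_tagsets mechanism: a form prefixed with
    "(both tag1, tag2)" contributes its tags to later tag-less forms."""
    carried: Optional[list[str]] = None
    out: list[tuple[str, list[str]]] = []
    for item in items:
        m = re.match(r"\((all|both) ([^)]*)\)\s+(.*)", item)
        if m:
            carried = [t.strip() for t in m.group(2).split(",")]
            out.append((m.group(3), list(carried)))
        elif carried is not None:
            # A later form with no parenthesis of its own inherits the carried tags.
            out.append((item, list(carried)))
        else:
            out.append((item, []))
    return out

print(propagate_both_tags(["(both nonstandard, proscribed) foos", "fooz"]))
# [('foos', ['nonstandard', 'proscribed']), ('fooz', ['nonstandard', 'proscribed'])]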

1762def parse_word_head( 

1763 wxr: WiktextractContext, 

1764 pos: str, 

1765 text: str, 

1766 data: WordData, 

1767 is_reconstruction: bool, 

1768 head_group: Optional[int], 

1769 ruby=None, 

1770 links=None, 

1771) -> None: 

1772 """Parses the head line for a word for in a particular language and 

1773 part-of-speech, extracting tags and related forms.""" 

1774 assert isinstance(wxr, WiktextractContext) 

1775 assert isinstance(pos, str) 

1776 assert isinstance(text, str) 

1777 assert isinstance(data, dict) 

1778 assert isinstance(ruby, (list, tuple)) or ruby is None 

1779 if ruby is None: 

1780 ruby = [] 

1781 assert is_reconstruction in (True, False) 

1782 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text)) 

1783 # print(f"PARSE_WORD_HEAD: {data=}") 

1784 if links is None: 

1785 links = [] 

1786 

1787 if len(links) > 0: 

1788 # If we have link data (that is, links containing things like commas and 

1789 # spaces), replace word_re with a modified local-scope pattern 

1790 word_re = re.compile( 

1791 r"|".join( 

1792 sorted((re.escape(s) for s in links), key=lambda x: -len(x)) 

1793 ) 

1794 + r"|" 

1795 + word_pattern 

1796 ) 

1797 else: 

1798 word_re = word_re_global 

1799 

1800 if "Lua execution error" in text or "Lua timeout error" in text: 1800 ↛ 1801line 1800 didn't jump to line 1801 because the condition on line 1800 was never true

1801 return 

1802 

1803 # In Aug 2021, some words had spurious Template:en at the end of head forms 

1804 # due to a Wiktionary error. 

1805 text = re.sub(r"\s+Template:[-a-zA-Z]+\s*$", "", text) 

1806 

1807 # Fix words with "superlative:" or "comparative:" at end of head 

1808 # e.g. grande/Spanish/Adj 

1809 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text) 

1810 

1811 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb 

1812 m = re.search(r", non-past ([^)]+ \([^)]+\))", text) 

1813 if m: 

1814 add_related( 

1815 wxr, 

1816 data, 

1817 ["non-past"], 

1818 [m.group(1)], 

1819 text, 

1820 True, 

1821 is_reconstruction, 

1822 head_group, 

1823 ruby, 

1824 ) 

1825 text = text[: m.start()] + text[m.end() :] 

1826 

1827 language = wxr.wtp.section 

1828 titleword = re.sub( 

1829 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE" 

1830 ) 

1831 titleparts = list( 

1832 m.group(0) 

1833 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE") 

1834 ) 

1835 if not titleparts: 1835 ↛ 1836line 1835 didn't jump to line 1836 because the condition on line 1835 was never true

1836 return 

1837 

1838 # Remove " or" from the end to prevent weird canonical forms 

1839 if text.endswith(" or"): 

1840 for tp in titleparts: 

1841 if text.endswith(tp): 1841 ↛ 1842line 1841 didn't jump to line 1842 because the condition on line 1841 was never true

1842 break 

1843 else: 

1844 text = text.removesuffix(" or").rstrip() 

1845 

1846 # Handle the part of the head that is not in parentheses. However, certain 

1847 # parenthesized parts are part of word, and those must be handled 

1848 # specially here. 

1849 if ruby: 1849 ↛ 1850line 1849 didn't jump to line 1850 because the condition on line 1849 was never true

1850 text = quote_kept_ruby(wxr, ruby, text) 

1851 base = text 

1852 base = quote_kept_parens(base) 

1853 base = remove_text_in_parentheses(base) 

1854 base = base.replace("?", "") # Removes uncertain articles etc 

1855 base = re.sub(r"\s+", " ", base) 

1856 base = re.sub(r" ([,;])", r"\1", base) 

1857 base = re.sub(r"(.*) •.*", r"\1", base) 

1858 # Many languages use • as a punctuation mark separating the base 

1859 # from the rest of the head. στάδιος/Ancient Greek, issue #176 

1860 base = base.strip() 

1861 

1862 # Check for certain endings in head (mostly for compatibility with weird 

1863 # heads, e.g. rata/Romanian "1st conj." at end) 

1864 m = re.search(head_end_re, base) 

1865 tags: Union[tuple[str, ...], list[str]] = [] 

1866 if m: 1866 ↛ 1867line 1866 didn't jump to line 1867 because the condition on line 1866 was never true

1867 tags = head_end_map[m.group(1).lower()].split() 

1868 data_extend(data, "tags", tags) 

1869 base = base[: m.start()] 

1870 

1871 # Special case: handle Hán Nôm readings for Vietnamese characters 

1872 m = re.match( 

1873 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base 

1874 ) 

1875 if m: 1875 ↛ 1876line 1875 didn't jump to line 1876 because the condition on line 1875 was never true

1876 tag, readings = m.groups() 

1877 tag = re.sub(r"\s+", "-", tag) 

1878 for reading in split_at_comma_semi(readings, skipped=links): 

1879 add_related( 

1880 wxr, 

1881 data, 

1882 [tag], 

1883 [reading], 

1884 text, 

1885 True, 

1886 is_reconstruction, 

1887 head_group, 

1888 ruby, 

1889 ) 

1890 return 

1891 

1892 # Special case: Hebrew " [pattern: nnn]" ending 

1893 m = re.search(r"\s+\[pattern: ([^]]+)\]", base) 

1894 if m: 1894 ↛ 1895line 1894 didn't jump to line 1895 because the condition on line 1894 was never true

1895 add_related( 

1896 wxr, 

1897 data, 

1898 ["class"], 

1899 [m.group(1)], 

1900 text, 

1901 True, 

1902 is_reconstruction, 

1903 head_group, 

1904 ruby, 

1905 ) 

1906 base = base[: m.start()] + base[m.end() :] 

1907 

1908 # Clean away some messy "Upload an image" template text used in 

1909 # American Sign Language: 

1910 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp 

1911 m = re.search(r"Upload .+ gif image.", base) 

1912 if m: 1912 ↛ 1913line 1912 didn't jump to line 1913 because the condition on line 1912 was never true

1913 base = base[: m.start()] + base[m.end() :] 

1914 

1915 # Split the head into alternatives. This is a complicated task, as 

1916 # we do not want to split on "or" or "," when immediately followed by more 

1917 # head-final tags, but otherwise do want to split by them. 

1918 # 20230907 added "or" to this to handle 'true or false', titles with 'or' 

1919 if wxr.wtp.title and ("," in wxr.wtp.title or " or " in wxr.wtp.title): 

1920 # A kludge to handle article titles/phrases with commas. 

1921 # Preprocess splits to first capture the title, then handle 

1922 # all the others as usual. 

1923 presplits = re.split(r"({})".format(wxr.wtp.title), base) 

1924 splits = [] 

1925 for psplit in presplits: 

1926 if psplit == wxr.wtp.title: 

1927 splits.append(psplit) 

1928 else: 

1929 splits.extend(re.split(head_split_re, psplit)) 

1930 else: 

1931 # Do the normal split; previously this was the only behavior. 

1932 splits = re.split(head_split_re, base) 

1933 # print("SPLITS:", splits) 

1934 alts: list[str] = [] 

1935 # print("parse_word_head: splits:", splits, 

1936 # "head_split_re_parens:", head_split_re_parens) 

1937 for i in range( 

1938 0, len(splits) - head_split_re_parens, head_split_re_parens + 1 

1939 ): 

1940 v = splits[i] 

1941 ending = splits[i + 1] or "" # XXX is this correct??? 

1942 # print("parse_word_head alts v={!r} ending={!r} alts={}" 

1943 # .format(v, ending, alts)) 

1944 if alts and (v == "" and ending): 

1945 assert ending[0] == " " 

1946 alts[-1] += " or" + ending # endings starts with space 

1947 elif v or ending: 1947 ↛ 1937line 1947 didn't jump to line 1937 because the condition on line 1947 was always true

1948 alts.append((v or "") + (ending or "")) 

1949 last = splits[-1].strip() 

1950 conn = "" if len(splits) < 3 else splits[-2] 

1951 # print("parse_word_head alts last={!r} conn={!r} alts={}" 

1952 # .format(last, conn, alts)) 

1953 if ( 

1954 alts 

1955 and last 

1956 and ( 

1957 last.split()[0] in xlat_head_map 

1958 or ( 

1959 conn == " or " 

1960 and (alts[-1] + " or " + last).strip() in xlat_head_map 

1961 ) 

1962 ) 

1963 ): 

1964 alts[-1] += " or " + last 

1965 elif last: 

1966 alts.append(last) 

1967 

1968 # print("parse_word_head alts: {}".format(alts)) 

1969 # print(f"{base=}") 

1970 

1971 # Process the head alternatives 

1972 canonicals: list[tuple[list[str], list[str]]] = [] 

1973 mode: Optional[str] = None 

1974 for alt_i, alt in enumerate(alts): 

1975 alt = alt.strip() 

1976 if alt.startswith("compound form:"): 1976 ↛ 1977line 1976 didn't jump to line 1977 because the condition on line 1976 was never true

1977 mode = "compound-form" 

1978 alt = alt[14:].strip() 

1979 if mode == "compound-form": 1979 ↛ 1980line 1979 didn't jump to line 1980 because the condition on line 1979 was never true

1980 add_related( 

1981 wxr, 

1982 data, 

1983 ["in-compounds"], 

1984 [alt], 

1985 text, 

1986 True, 

1987 is_reconstruction, 

1988 head_group, 

1989 ruby, 

1990 ) 

1991 continue 

1992 # For non-first parts, see if it can be treated as tags-only 

1993 if alt_i == 0: 

1994 expanded_alts = [alt] 

1995 else: 

1996 expanded_alts = map_with(xlat_descs_map, [alt]) 

1997 # print("EXPANDED_ALTS:", expanded_alts) 

1998 tagsets: Optional[list[tuple[str, ...]]] 

1999 for alt in expanded_alts: 

2000 baseparts = list(m.group(0) for m in re.finditer(word_re, alt)) 

2001 if alt_i > 0: 

2002 tagsets, topics = decode_tags(" ".join(baseparts)) 

2003 if not any("error-unknown-tag" in x for x in tagsets): 

2004 data_extend(data, "topics", topics) 

2005 for tags1 in tagsets: 

2006 data_extend(data, "tags", tags1) 

2007 continue 

2008 

2009 alt, tags = parse_head_final_tags( 

2010 wxr, language or "MISSING_LANG", alt 

2011 ) 

2012 tags = list(tags) # Make sure we don't modify anything cached 

2013 tags.append("canonical") 

2014 if alt_i == 0 and "," in wxr.wtp.title: # type:ignore[operator] 

2015 # Kludge to handle article titles/phrases with commas. 

2016 # basepart's regex strips commas, which leads to a 

2017 # canonical form that is the title phrase without a comma. 

2018 # basepart in add_related is almost immediately joined with 

2019 # spaces anyhow. XXX not exactly sure why it's 

2020 # canonicals.append((tags, baseparts)) and not (tags, [alt]) 

2021 baseparts = [alt] 

2022 canonicals.append((tags, baseparts)) 

2023 for tags, baseparts in canonicals: 

2024 add_related( 

2025 wxr, 

2026 data, 

2027 tags, 

2028 baseparts, 

2029 text, 

2030 len(canonicals) > 1, 

2031 is_reconstruction, 

2032 head_group, 

2033 ruby, 

2034 ) 

2035 

2036 # Handle parenthesized descriptors for the word form and links to 

2037 # related words 

2038 text = quote_kept_parens(text) 

2039 parens = list( 

2040 m.group(2) 

2041 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text) 

2042 ) 

2043 parens.extend( 

2044 m.group(1) 

2045 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text) 

2046 ) 

2047 have_romanization = False 

2048 have_ruby = False 

2049 hiragana = "" 

2050 katakana = "" 

2051 for paren in parens: 

2052 paren = paren.strip() 

2053 if not paren: 2053 ↛ 2054line 2053 didn't jump to line 2054 because the condition on line 2053 was never true

2054 continue 

2055 if paren.startswith("see "): 

2056 continue 

2057 if paren.startswith("U+"): 2057 ↛ 2058line 2057 didn't jump to line 2058 because the condition on line 2057 was never true

2058 continue 

2059 # In some rare cases, strip a word that inflects from the form 

2060 # description, e.g. "look through rose-tinted glasses"/English. 

2061 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren) 

2062 

2063 # If it starts with hiragana or katakana, treat as such form. Note 

2064 # that each hiragana/katakana character is in separate parentheses, 

2065 # so we must concatenate them. 

2066 try: 

2067 un = unicodedata.name(paren[0]).split()[0] 

2068 except ValueError: 

2069 un = "INVALID" 

2070 if un == "KATAKANA": 2070 ↛ 2071line 2070 didn't jump to line 2071 because the condition on line 2070 was never true

2071 katakana += paren 

2072 have_ruby = True 

2073 continue 

2074 if un == "HIRAGANA": 2074 ↛ 2075line 2074 didn't jump to line 2075 because the condition on line 2074 was never true

2075 hiragana += paren 

2076 have_ruby = True 

2077 continue 

2078 

2079 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes," 

2080 # in the middle of the parenthesized expression, e.g. 薄 

2081 def strokes_repl(m: re.Match) -> str: 

2082 strokes1, tags1, strokes2, tags2 = m.groups() 

2083 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]: 

2084 tags = tags.split(", ") 

2085 tags = list( 

2086 "Mainland China" if t == "Mainland" else t for t in tags 

2087 ) 

2088 tags.append("strokes") 

2089 add_related( 

2090 wxr, 

2091 data, 

2092 tags, 

2093 [strokes], 

2094 text, 

2095 True, 

2096 is_reconstruction, 

2097 head_group, 

2098 ruby, 

2099 ) 

2100 return ", " 

2101 

2102 paren = re.sub( 

2103 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ", 

2104 strokes_repl, 

2105 paren, 

2106 ) 

2107 

2108 descriptors = map_with(xlat_descs_map, [paren]) 

2109 new_desc = [] 

2110 for desc in descriptors: 

2111 new_desc.extend( 

2112 map_with( 

2113 xlat_tags_map, 

2114 split_at_comma_semi(desc, extra=[", or "], skipped=links), 

2115 ) 

2116 ) 

2117 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None 

2118 following_tags = None # Added to prev_tags from previous parenthesized 

2119 # part, e.g. walrus/English 

2120 # "(both nonstandard, proscribed, uncommon)" 

2121 for desc_i, desc in enumerate(new_desc): 

2122 # print("HEAD DESC: {!r}".format(desc)) 

2123 

2124 # Abort on certain descriptors (assume remaining values are 

2125 # examples or uninteresting, cf. gaan/Navajo, horior/Latin) 

2126 if re.match(r"^(per |e\.g\.$)", desc): 2126 ↛ 2127line 2126 didn't jump to line 2127 because the condition on line 2126 was never true

2127 break 

2128 

2129 # If it all consists of CJK characters, add it with the 

2130 # CJK tag. This is used at least for some Vietnamese 

2131 # words (e.g., ba/Vietnamese) 

2132 try: 

2133 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2133 ↛ 2134line 2133 didn't jump to line 2134 because the condition on line 2133 was never true

2134 add_related( 

2135 wxr, 

2136 data, 

2137 ["CJK"], 

2138 [desc], 

2139 text, 

2140 True, 

2141 is_reconstruction, 

2142 head_group, 

2143 ruby, 

2144 ) 

2145 continue 

2146 except ValueError: 

2147 pass 

2148 

2149 # Handle some special cases 

2150 splitdesc = desc.split() 

2151 if ( 2151 ↛ 2160line 2151 didn't jump to line 2160 because the condition on line 2151 was never true

2152 len(splitdesc) >= 3 

2153 and splitdesc[1] == "superlative" 

2154 and classify_desc(splitdesc[0]) != "tags" 

2155 and prev_tags 

2156 ): 

2157 # Handle the special case of second comparative after comma, 

2158 # followed by superlative without comma. E.g. 

2159 # mal/Portuguese/Adv 

2160 for ts in prev_tags: 

2161 add_related( 

2162 wxr, 

2163 data, 

2164 ts, 

2165 [splitdesc[0]], 

2166 text, 

2167 True, 

2168 is_reconstruction, 

2169 head_group, 

2170 ruby, 

2171 ) 

2172 desc = " ".join(splitdesc[1:]) 

2173 elif ( 2173 ↛ 2181line 2173 didn't jump to line 2181 because the condition on line 2173 was never true

2174 len(splitdesc) == 2 

2175 and splitdesc[0] in ("also", "and") 

2176 and prev_tags 

2177 and classify_desc(splitdesc[1]) != "tags" 

2178 ): 

2179 # Sometimes alternative forms are prefixed with "also" or 

2180 # "and" 

2181 for ts in prev_tags: 

2182 add_related( 

2183 wxr, 

2184 data, 

2185 ts, 

2186 [splitdesc[1]], 

2187 text, 

2188 True, 

2189 is_reconstruction, 

2190 head_group, 

2191 ruby, 

2192 ) 

2193 continue 

2194 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2194 ↛ 2195line 2194 didn't jump to line 2195 because the condition on line 2194 was never true

2195 continue 

2196 

2197 # If only one word, assume it is comma-separated alternative 

2198 # to the previous one 

2199 if " " not in desc: 

2200 cls = classify_desc(desc) 

2201 if cls != "tags": 

2202 if prev_tags: 2202 ↛ 2204line 2202 didn't jump to line 2204 because the condition on line 2202 was never true

2203 # Assume comma-separated alternative to previous one 

2204 for ts in prev_tags: 

2205 add_related( 

2206 wxr, 

2207 data, 

2208 ts, 

2209 [desc], 

2210 text, 

2211 True, 

2212 is_reconstruction, 

2213 head_group, 

2214 ruby, 

2215 ) 

2216 continue 

2217 elif distw(titleparts, desc) <= 0.5: 2217 ↛ 2220line 2217 didn't jump to line 2220 because the condition on line 2217 was never true

2218 # Similar to the head word, assume a dialectal variant of 

2219 # the base form. Cf. go/Alemannic German/Verb 

2220 add_related( 

2221 wxr, 

2222 data, 

2223 ["alternative"], 

2224 [desc], 

2225 text, 

2226 True, 

2227 is_reconstruction, 

2228 head_group, 

2229 ruby, 

2230 ) 

2231 continue 

2232 elif ( 2232 ↛ 2253line 2232 didn't jump to line 2253 because the condition on line 2232 was always true

2233 cls in ("romanization", "english") 

2234 and not have_romanization 

2235 and classify_desc(titleword) == "other" 

2236 and not ( 

2237 "categories" in data and desc in data["categories"] 

2238 ) 

2239 ): 

2240 # Assume it to be a romanization 

2241 add_romanization( 

2242 wxr, 

2243 data, 

2244 desc, 

2245 text, 

2246 is_reconstruction, 

2247 head_group, 

2248 ruby, 

2249 ) 

2250 have_romanization = True 

2251 continue 

2252 

2253 m = re.match(r"^(\d+) strokes?$", desc) 

2254 if m: 

2255 # Special case, used to give #strokes for Han characters 

2256 add_related( 

2257 wxr, 

2258 data, 

2259 ["strokes"], 

2260 [m.group(1)], 

2261 text, 

2262 True, 

2263 is_reconstruction, 

2264 head_group, 

2265 ruby, 

2266 ) 

2267 continue 

2268 

2269 # See if it is radical+strokes 

2270 m = re.match( 

2271 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF" 

2272 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)" 

2273 r"( in (Japanese|Chinese|traditional Chinese|" 

2274 r"simplified Chinese))?$", 

2275 desc, 

2276 ) 

2277 if m: 2277 ↛ 2280line 2277 didn't jump to line 2280 because the condition on line 2277 was never true

2278 # Special case, used to give radical + strokes for Han 

2279 # characters 

2280 radical_strokes = m.group(1) 

2281 lang = m.group(3) 

2282 t = ["radical+strokes"] 

2283 if lang: 

2284 t.extend(lang.split()) 

2285 add_related( 

2286 wxr, 

2287 data, 

2288 t, 

2289 [radical_strokes], 

2290 text, 

2291 True, 

2292 is_reconstruction, 

2293 head_group, 

2294 ruby, 

2295 ) 

2296 prev_tags = None 

2297 following_tags = None 

2298 continue 

2299 
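# Illustration (added for clarity; the sample string is invented): the
# radical+strokes regex above matches e.g. "木+4 in Japanese", yielding
# m.group(1) == "木+4" and m.group(3) == "Japanese".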

2300 # See if it indicates historical Katakana orthography (←) or 

2301 # just otherwise katakana/hiragana form 

2302 m = re.match(r"←\s*|kana\s+", desc) 

2303 if m: 2303 ↛ 2304line 2303 didn't jump to line 2304 because the condition on line 2303 was never true

2304 if desc.startswith("←"): 

2305 t1 = "historical " 

2306 else: 

2307 t1 = "" 

2308 x = desc[m.end() :] 

2309 if x.endswith("?"): 

2310 x = x[:-1] 

2311 # XXX should we add a tag indicating uncertainty? 

2312 if x: 

2313 name = unicodedata.name(x[0]) 

2314 if name.startswith("HIRAGANA "): 

2315 desc = t1 + "hiragana " + x 

2316 elif name.startswith("KATAKANA "): 

2317 desc = t1 + "katakana " + x 

2318 

2319 # See if it is "n strokes in Chinese" or similar 

2320 m = re.match( 

2321 r"(\d+) strokes in (Chinese|Japanese|" 

2322 r"traditional Chinese|simplified Chinese)$", 

2323 desc, 

2324 ) 

2325 if m: 2325 ↛ 2327line 2325 didn't jump to line 2327 because the condition on line 2325 was never true

2326 # Special case, used to give just strokes for some Han chars 

2327 strokes = m.group(1) 

2328 lang = m.group(2) 

2329 t = ["strokes"] 

2330 t.extend(lang.split()) 

2331 add_related( 

2332 wxr, 

2333 data, 

2334 t, 

2335 [strokes], 

2336 text, 

2337 True, 

2338 is_reconstruction, 

2339 head_group, 

2340 ruby, 

2341 ) 

2342 prev_tags = None 

2343 following_tags = None 

2344 continue 

2345 

2346 # American Sign Language has images (or requests for image) 

2347 # as heads, + this ASL gloss after. 

2348 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text) 

2349 if m2: 2349 ↛ 2350line 2349 didn't jump to line 2350 because the condition on line 2349 was never true

2350 add_related( 

2351 wxr, 

2352 data, 

2353 ["ASL-gloss"], 

2354 [m2.group(1)], 

2355 text, 

2356 True, 

2357 is_reconstruction, 

2358 head_group, 

2359 ruby, 

2360 ) 

2361 continue 

2362 

2363 parts = list(m.group(0) for m in re.finditer(word_re, desc)) 

2364 if not parts: 2364 ↛ 2365line 2364 didn't jump to line 2365 because the condition on line 2364 was never true

2365 prev_tags = None 

2366 following_tags = None 

2367 continue 

2368 

2369 # Check for certain language-specific header part starts that 

2370 # modify the tags applied to the following word form 

2371 if len(parts) == 2 and language in lang_specific_head_map: 2371 ↛ 2372line 2371 didn't jump to line 2372 because the condition on line 2371 was never true

2372 ht = lang_specific_head_map[language] 

2373 if parts[0] in ht: 

2374 rem_tags, add_tags = ht[parts[0]] 

2375 new_prev_tags1: list[list[str]] = [] 

2376 tags2: Union[tuple[str, ...], list[str]] 

2377 for tags2 in prev_tags or [()]: 

2378 if rem_tags is True: # Remove all old tags 

2379 tsets = set() 

2380 else: 

2381 tsets = set(tags2) - set(rem_tags.split()) 

2382 tsets = tsets | set(add_tags.split()) 

2383 tags = list(sorted(tsets)) 

2384 add_related( 

2385 wxr, 

2386 data, 

2387 tags, 

2388 [parts[1]], 

2389 text, 

2390 True, 

2391 is_reconstruction, 

2392 head_group, 

2393 ruby, 

2394 ) 

2395 new_prev_tags1.append(tags) 

2396 prev_tags = new_prev_tags1 

2397 following_tags = None 

2398 continue 

2399 

2400 # Handle the special case of descriptors that are parenthesized, 

2401 # e.g., (archaic or Scotland) 

2402 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc) 

2403 if m is not None and classify_desc(m.group(1)) == "tags": 2403 ↛ 2404line 2403 didn't jump to line 2404 because the condition on line 2403 was never true

2404 tagpart = m.group(1) 

2405 related = [m.group(2)] 

2406 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True) 

2407 if topics: 

2408 wxr.wtp.debug( 

2409 "parenthized head part {!r} contains topics: {}".format( 

2410 tagpart, topics 

2411 ), 

2412 sortid="form_descriptions/1647", 

2413 ) 

2414 elif m is not None and re.match(r"in the sense ", m.group(1)): 2414 ↛ 2417line 2414 didn't jump to line 2417 because the condition on line 2414 was never true

2415 # Handle certain ignored cases 

2416 # e.g. bord/Danish: in the sense "plank" 

2417 related = [m.group(2)] 

2418 tagsets = [()] 

2419 else: 

2420 # Normal parsing of the descriptor 

2421 alt_related = None 

2422 alt_tagsets = None 

2423 tagsets = None 

2424 for i in range(len(parts), 0, -1): 2424 ↛ 2469line 2424 didn't jump to line 2469 because the loop on line 2424 didn't complete

2425 related = parts[i:] 

2426 tagparts = parts[:i] 

2427 # print(" i={} related={} tagparts={}" 

2428 # .format(i, related, tagparts)) 

2429 tagsets, topics = decode_tags( 

2430 " ".join(tagparts), no_unknown_starts=True 

2431 ) 

2432 # print("tagparts={!r} tagsets={} topics={} related={} " 

2433 # "alt_related={} distw={:.2f}" 

2434 # .format(tagparts, tagsets, topics, related, 

2435 # alt_related, 

2436 # distw(titleparts, parts[i - 1]))) 

2437 if ( 

2438 topics 

2439 or not tagsets 

2440 or any("error-unknown-tag" in x for x in tagsets) 

2441 ): 

2442 if alt_related is not None: 2442 ↛ 2444line 2442 didn't jump to line 2444 because the condition on line 2442 was never true

2443 # We already had a good division, so let's stop. 

2444 break 

2445 # Bad division, try deeper 

2446 continue 

2447 if ( 2447 ↛ 2462line 2447 didn't jump to line 2462 because the condition on line 2447 was never true

2448 i > 1 

2449 and len(parts[i - 1]) >= 4 

2450 and distw(titleparts, parts[i - 1]) <= 0.4 

2451 # Fixes wiktextract #983, where "participle" 

2452 # was too close to "Martinize" and so this accepted 

2453 # ["participle", "Martinize"] as matching; this 

2454 # kludge prevents this from happening if titleparts 

2455 # is shorter than what would be 'related'. 

2456 # This breaks if we want to detect stuff that 

2457 # actually gets an extra space-separated word when 

2458 # 'inflected'. 

2459 and len(titleparts) >= len(parts[i - 1:]) 

2460 ): 

2461 # print(f"Reached; {parts=}, {parts[i-1]=}") 

2462 alt_related = related 

2463 alt_tagsets = tagsets 

2464 continue 

2465 alt_related = None 

2466 alt_tagsets = None 

2467 break 

2468 else: 

2469 if alt_related is None: 

2470 # Check if the parenthesized part is likely a 

2471 # romanization 

2472 if ( 

2473 (have_ruby or classify_desc(base) == "other") 

2474 and classify_desc(paren) == "romanization" 

2475 and not ( 

2476 "categories" in data 

2477 and desc in data["categories"] 

2478 ) 

2479 ): 

2480 for r in split_at_comma_semi( 

2481 paren, extra=[" or "], skipped=links 

2482 ): 

2483 add_romanization( 

2484 wxr, 

2485 data, 

2486 r, 

2487 text, 

2488 is_reconstruction, 

2489 head_group, 

2490 ruby, 

2491 ) 

2492 have_romanization = True 

2493 continue 

2494 tagsets = [("error-unrecognized-head-form",)] 

2495 wxr.wtp.debug( 

2496 "unrecognized head form: {}".format(desc), 

2497 sortid="form_descriptions/1698", 

2498 ) 

2499 continue 

2500 

2501 if alt_related is not None: 2501 ↛ 2502line 2501 didn't jump to line 2502 because the condition on line 2501 was never true

2502 related = alt_related 

2503 tagsets = alt_tagsets 

2504 

2505 # print("FORM END: tagsets={} related={}".format(tagsets, related)) 

2506 # print("==================") 

2507 if not tagsets: 2507 ↛ 2508line 2507 didn't jump to line 2508 because the condition on line 2507 was never true

2508 continue 

2509 

2510 assert isinstance(related, (list, tuple)) 

2511 related_str = " ".join(related) 

2512 if "or" in titleparts: 

2513 alts = [related_str] 

2514 else: 

2515 alts = split_at_comma_semi( 

2516 related_str, separators=[" or "], skipped=links 

2517 ) 

2518 if not alts: 

2519 alts = [""] 

2520 for related_str in alts: 

2521 if related_str: 

2522 if prev_tags and ( 

2523 all( 

2524 all( 

2525 t in ["nonstandard", "dialectal"] 

2526 or valid_tags[t] == "dialect" 

2527 for t in tags 

2528 ) 

2529 for ts in tagsets 

2530 ) 

2531 or ( 

2532 any("participle" in ts for ts in prev_tags) 

2533 and all( 

2534 "attributive" in ts 

2535 or any(valid_tags[t] == "gender" for t in ts) 

2536 for ts in tagsets 

2537 ) 

2538 ) 

2539 ): 

2540 # Merged with previous tags. Don't update previous 

2541 # tags here; cf. burn/English/Verb 

2542 for tags_l in tagsets: 

2543 for ts in prev_tags: 

2544 tags_l1 = list(sorted(set(tags_l) | set(ts))) 

2545 add_related( 

2546 wxr, 

2547 data, 

2548 tags_l1, 

2549 [related_str], 

2550 text, 

2551 True, 

2552 is_reconstruction, 

2553 head_group, 

2554 ruby, 

2555 ) 

2556 else: 

2557 # Not merged with previous tags 

2558 for tags_l in tagsets: 

2559 if following_tags is not None: 2559 ↛ 2560line 2559 didn't jump to line 2560 because the condition on line 2559 was never true

2560 for ts in following_tags: 

2561 tags_l1 = list( 

2562 sorted(set(tags_l) | set(ts)) 

2563 ) 

2564 add_related( 

2565 wxr, 

2566 data, 

2567 tags_l1, 

2568 [related_str], 

2569 text, 

2570 True, 

2571 is_reconstruction, 

2572 head_group, 

2573 ruby, 

2574 ) 

2575 else: 

2576 ret = add_related( 

2577 wxr, 

2578 data, 

2579 tags_l, 

2580 [related_str], 

2581 text, 

2582 True, 

2583 is_reconstruction, 

2584 head_group, 

2585 ruby, 

2586 ) 

2587 if ret is not None: 2587 ↛ 2588line 2587 didn't jump to line 2588 because the condition on line 2587 was never true

2588 following_tags = ret 

2589 prev_tags = tagsets 

2590 else: 

2591 if desc_i < len(new_desc) - 1 and all( 2591 ↛ 2598line 2591 didn't jump to line 2598 because the condition on line 2591 was never true

2592 "participle" in ts or "infinitive" in ts 

2593 for ts in tagsets 

2594 ): 

2595 # Interpret it as a standalone form description 

2596 # in the middle, probably followed by forms or 

2597 # language-specific descriptors. cf. drikke/Danish 

2598 new_prev_tags2 = [] 

2599 for ts1 in prev_tags or [()]: 

2600 for ts2 in tagsets: 

2601 ts = tuple(sorted(set(ts1) | set(ts2))) 

2602 new_prev_tags2.append(ts) 

2603 prev_tags = new_prev_tags2 

2604 continue 

2605 for tags in tagsets: 

2606 data_extend(data, "tags", tags) 

2607 prev_tags = tagsets 

2608 following_tags = None 

2609 

2610 # Finally, if we collected hirakana/katakana, add them now 

2611 if hiragana: 2611 ↛ 2612line 2611 didn't jump to line 2612 because the condition on line 2611 was never true

2612 add_related( 

2613 wxr, 

2614 data, 

2615 ["hiragana"], 

2616 [hiragana], 

2617 text, 

2618 True, 

2619 is_reconstruction, 

2620 head_group, 

2621 ruby, 

2622 ) 

2623 if katakana: 2623 ↛ 2624line 2623 didn't jump to line 2624 because the condition on line 2623 was never true

2624 add_related( 

2625 wxr, 

2626 data, 

2627 ["katakana"], 

2628 [katakana], 

2629 text, 

2630 True, 

2631 is_reconstruction, 

2632 head_group, 

2633 ruby, 

2634 ) 

2635 

2636 # XXX check if this is actually relevant, tags in word root data 

2637 # is extremely rare (not sure where they slip through). 

2638 tags = data.get("tags", []) # type:ignore 

2639 if len(tags) > 0: 

2640 # wxr.wtp.debug( 

2641 # f"Tags appear in word root data: {data['tags']=}", # type:ignore 

2642 # sortid="form_descriptions/2620/20240606", 

2643 # ) # Messes up tests. 

2644 data["tags"] = list(sorted(set(tags))) # type:ignore 

2645 

2646 
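As a rough illustration of what the parenthesized-descriptor extraction in parse_word_head picks up, the snippet below applies the same two regular expressions to an invented head line (the sample text is hypothetical; real head lines come from expanded head templates).

import re

sample = "perro m (plural perros, feminine perra)"  # invented head text

# Parenthesized groups preceded by start-of-string or whitespace (group 2 is the body)
parens = [m.group(2) for m in
          re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", sample)]
# Parenthesized groups glued to the preceding token (group 1 is the body)
parens.extend(m.group(1) for m in
              re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", sample))

print(parens)  # ['plural perros, feminine perra']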

2647def parse_sense_qualifier( 

2648 wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData] 

2649) -> None: 

2650 """Parses tags or topics for a sense or some other data. The values are 

2651 added into the dictionary ``data``.""" 

2652 assert isinstance(wxr, WiktextractContext) 

2653 assert isinstance(text, str) 

2654 assert isinstance(data, dict) 

2655 # print("parse_sense_qualifier:", text) 

2656 if re.match(r"\([^()]+\)$", text): 2656 ↛ 2657line 2656 didn't jump to line 2657 because the condition on line 2656 was never true

2657 text = text[1:-1] 

2658 if re.match(r'"[^"]+"$', text): 2658 ↛ 2659line 2658 didn't jump to line 2659 because the condition on line 2658 was never true

2659 text = text[1:-1] 

2660 lst = map_with(xlat_descs_map, [text]) 

2661 sense_tags: list[str] = [] 

2662 for text in lst: 

2663 for semi in split_at_comma_semi(text): 

2664 if not semi: 2664 ↛ 2665line 2664 didn't jump to line 2665 because the condition on line 2664 was never true

2665 continue 

2666 orig_semi = semi 

2667 idx = semi.find(":") 

2668 if idx >= 0: 2668 ↛ 2669line 2668 didn't jump to line 2669 because the condition on line 2668 was never true

2669 semi = semi[:idx] 

2670 cls = classify_desc(semi, allow_unknown_tags=True) 

2671 # print("parse_sense_qualifier: classify_desc: {} -> {}" 

2672 # .format(semi, cls)) 

2673 if cls == "tags": 2673 ↛ 2682line 2673 didn't jump to line 2682 because the condition on line 2673 was always true

2674 tagsets, topics = decode_tags(semi) 

2675 data_extend(data, "topics", topics) 

2676 # XXX should think how to handle distinct options better, 

2677 # e.g., "singular and plural genitive"; that can't really be 

2678 # done with changing the calling convention of this function. 

2679 # Should split sense if more than one category of tags differs. 

2680 for tags in tagsets: 

2681 sense_tags.extend(tags) 

2682 elif cls == "taxonomic": 

2683 if re.match(r"×[A-Z]", semi): 

2684 sense_tags.append("extinct") 

2685 semi = semi[1:] 

2686 data["taxonomic"] = semi 

2687 elif cls == "english": 

2688 if "qualifier" in data and data["qualifier"] != orig_semi: 

2689 data["qualifier"] += "; " + orig_semi 

2690 else: 

2691 data["qualifier"] = orig_semi 

2692 else: 

2693 wxr.wtp.debug( 

2694 "unrecognized sense qualifier: {}".format(text), 

2695 sortid="form_descriptions/1831", 

2696 ) 

2697 sense_tags = list(sorted(set(sense_tags))) 

2698 data_extend(data, "tags", sense_tags) 

2699 

2700 
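parse_sense_qualifier above routes each comma/semicolon-separated piece by its classify_desc() class: tag-like pieces become tags, taxonomic names go under "taxonomic", and plain English ends up in "qualifier". The sketch below mimics only that routing; the stub classifier is a deliberately simplistic stand-in, not the real classify_desc() logic.

def toy_classify(piece: str) -> str:
    # Stand-in for classify_desc(); the real code consults tag tables and word lists.
    known_tags = {"slang", "archaic", "transitive", "figuratively"}
    if all(w in known_tags for w in piece.split()):
        return "tags"
    if piece[:1].isupper() and len(piece.split()) == 2:
        return "taxonomic"
    return "english"

def toy_sense_qualifier(text: str) -> dict:
    data: dict = {"tags": []}
    for piece in (p.strip() for p in text.replace(";", ",").split(",")):
        cls = toy_classify(piece)
        if cls == "tags":
            data["tags"].extend(piece.split())
        elif cls == "taxonomic":
            data["taxonomic"] = piece
        else:
            data["qualifier"] = (data.get("qualifier", "") + "; " + piece).lstrip("; ")
    return data

print(toy_sense_qualifier("slang, of a boat"))
# {'tags': ['slang'], 'qualifier': 'of a boat'}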

2701def parse_pronunciation_tags( 

2702 wxr: WiktextractContext, text: str, data: SoundData 

2703) -> None: 

2704 assert isinstance(wxr, WiktextractContext) 

2705 assert isinstance(text, str) 

2706 assert isinstance(data, dict) 

2707 text = text.strip() 

2708 if not text: 2708 ↛ 2709line 2708 didn't jump to line 2709 because the condition on line 2708 was never true

2709 return 

2710 cls = classify_desc(text) 

2711 notes = [] 

2712 if cls == "tags": 

2713 tagsets, topics = decode_tags(text) 

2714 data_extend(data, "topics", topics) 

2715 for tagset in tagsets: 

2716 for t in tagset: 

2717 if " " in t: 2717 ↛ 2718line 2717 didn't jump to line 2718 because the condition on line 2717 was never true

2718 notes.append(t) 

2719 else: 

2720 data_append(data, "tags", t) 

2721 else: 

2722 notes.append(text) 

2723 if notes: 

2724 data["note"] = "; ".join(notes) 

2725 

2726 
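parse_pronunciation_tags above keeps single-word decoded tags in "tags" and diverts multi-word tags, or text that is not tag-like at all, into a free-form "note". A minimal stand-in with a fake single-token tag vocabulary (the real code uses classify_desc() and decode_tags()) looks like this.

def toy_pronunciation_tags(text: str) -> dict:
    vocab = {"UK", "US", "obsolete", "colloquial"}  # fake tag vocabulary
    data: dict = {"tags": []}
    notes: list[str] = []
    pieces = [p.strip() for p in text.split(",") if p.strip()]
    if pieces and all(p in vocab for p in pieces):
        data["tags"].extend(pieces)
    else:
        notes.append(text.strip())
    if notes:
        data["note"] = "; ".join(notes)
    return data

print(toy_pronunciation_tags("UK, obsolete"))   # {'tags': ['UK', 'obsolete']}
print(toy_pronunciation_tags("before vowels"))  # {'tags': [], 'note': 'before vowels'}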

2727def parse_translation_desc( 

2728 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData 

2729) -> None: 

2730 assert isinstance(wxr, WiktextractContext) 

2731 assert isinstance(lang, str) # The language of ``text`` 

2732 assert isinstance(text, str) 

2733 assert isinstance(tr, dict) 

2734 # print("parse_translation_desc:", text) 

2735 

2736 # Process all parenthesized parts from the translation item 

2737 note = None 

2738 restore_beginning = "" 

2739 restore_end = "" 

2740 while True: 

2741 beginning = False 

2742 # See if we can find a parenthesized expression at the end 

2743 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text) 

2744 if m: 

2745 par = m.group(1) 

2746 text = text[: m.start()] 

2747 if par.startswith(("literally ", "lit.")): 2747 ↛ 2748line 2747 didn't jump to line 2748 because the condition on line 2747 was never true

2748 continue # Not useful for disambiguation in many idioms 

2749 else: 

2750 # See if we can find a parenthesized expression at the start 

2751 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text) 

2752 if m: 2752 ↛ 2753line 2752 didn't jump to line 2753 because the condition on line 2752 was never true

2753 par = m.group(1) 

2754 text = text[m.end() :] 

2755 beginning = True 

2756 if re.match(r"^(\d|\s|,| or | and )+$", par): 

2757 # Looks like this beginning parenthesized expression only 

2758 # contains digits or their combinations. We assume such 

2759 # to be sense descriptions if no sense has been selected, 

2760 # or otherwise just ignore them. 

2761 if not tr.get("sense"): 

2762 tr["sense"] = par 

2763 continue 

2764 else: 

2765 # See if we can find a parenthesized expression in the middle. 

2766 # Romanizations are sometimes between word and gender marker, 

2767 # e.g. wife/English/Tr/Yiddish. 

2768 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text) 

2769 if m: 2769 ↛ 2770line 2769 didn't jump to line 2770 because the condition on line 2769 was never true

2770 par = m.group(1) 

2771 text = text[: m.start()] + text[m.end() :] 

2772 else: 

2773 # No more parenthesized expressions - break out of the loop 

2774 break 

2775 

2776 # Some cleanup of artifacts that may result from skipping some templates 

2777 # in earlier stages 

2778 if par.startswith(": "): 2778 ↛ 2779line 2778 didn't jump to line 2779 because the condition on line 2778 was never true

2779 par = par[2:] 

2780 if par.endswith(","): 2780 ↛ 2781line 2780 didn't jump to line 2781 because the condition on line 2780 was never true

2781 par = par[:-1] 

2782 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 2782 ↛ 2783line 2782 didn't jump to line 2783 because the condition on line 2782 was never true

2783 par = par[1:-1] 

2784 par = par.strip() 

2785 

2786 # Check for special script pronunciation followed by romanization, 

2787 # used in many Asian languages. 

2788 lst = par.split(", ") 

2789 if len(lst) == 2: 2789 ↛ 2790line 2789 didn't jump to line 2790 because the condition on line 2789 was never true

2790 a, r = lst 

2791 if classify_desc(a) == "other": 

2792 cls = classify_desc(r) 

2793 # print("parse_translation_desc: r={} cls={}".format(r, cls)) 

2794 if cls == "romanization" or ( 

2795 cls == "english" and len(r.split()) == 1 and r[0].islower() 

2796 ): 

2797 if tr.get("alt") and tr.get("alt") != a: 

2798 wxr.wtp.debug( 

2799 'more than one value in "alt": {} vs. {}'.format( 

2800 tr["alt"], a 

2801 ), 

2802 sortid="form_descriptions/1930", 

2803 ) 

2804 tr["alt"] = a 

2805 if tr.get("roman") and tr.get("roman") != r: 

2806 wxr.wtp.debug( 

2807 'more than one value in "roman": ' 

2808 "{} vs. {}".format(tr["roman"], r), 

2809 sortid="form_descriptions/1936", 

2810 ) 

2811 tr["roman"] = r 

2812 continue 

2813 
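# Illustration (added for clarity; the sample pair is invented): the block
# above handles parenthesized pairs such as "пример, primer" -- a non-Latin
# form classified as "other" followed by its romanization -- storing the
# first part in tr["alt"] and the second in tr["roman"].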

2814 # Check for certain comma-separated tags combined with English text 

2815 # at the beginning or end of a comma-separated parenthesized list 

2816 while len(lst) > 1: 2816 ↛ 2817line 2816 didn't jump to line 2817 because the condition on line 2816 was never true

2817 cls = classify_desc(lst[0]) 

2818 if cls == "tags": 

2819 tagsets, topics = decode_tags(lst[0]) 

2820 for t in tagsets: 

2821 data_extend(tr, "tags", t) 

2822 data_extend(tr, "topics", topics) 

2823 lst = lst[1:] 

2824 continue 

2825 cls = classify_desc(lst[-1]) 

2826 if cls == "tags": 

2827 tagsets, topics = decode_tags(lst[-1]) 

2828 for t in tagsets: 

2829 data_extend(tr, "tags", t) 

2830 data_extend(tr, "topics", topics) 

2831 lst = lst[:-1] 

2832 continue 

2833 break 

2834 par = ", ".join(lst) 

2835 

2836 if not par: 2836 ↛ 2837line 2836 didn't jump to line 2837 because the condition on line 2836 was never true

2837 continue 

2838 if re.search(tr_ignored_parens_re, par): 2838 ↛ 2839line 2838 didn't jump to line 2839 because the condition on line 2838 was never true

2839 continue 

2840 if par.startswith("numeral:"): 2840 ↛ 2841line 2840 didn't jump to line 2841 because the condition on line 2840 was never true

2841 par = par[8:].strip() 

2842 

2843 # Classify the part in parenthesis and process accordingly 

2844 cls = classify_desc(par) 

2845 # print("parse_translation_desc classify: {!r} -> {}" 

2846 # .format(par, cls)) 

2847 if par == text: 2847 ↛ 2848line 2847 didn't jump to line 2848 because the condition on line 2847 was never true

2848 pass 

2849 if par == "f": 2849 ↛ 2850line 2849 didn't jump to line 2850 because the condition on line 2849 was never true

2850 data_append(tr, "tags", "feminine") 

2851 elif par == "m": 2851 ↛ 2852line 2851 didn't jump to line 2852 because the condition on line 2851 was never true

2852 data_append(tr, "tags", "masculine") 

2853 elif cls == "tags": 2853 ↛ 2854line 2853 didn't jump to line 2854 because the condition on line 2853 was never true

2854 tagsets, topics = decode_tags(par) 

2855 for tags in tagsets: 

2856 data_extend(tr, "tags", tags) 

2857 data_extend(tr, "topics", topics) 

2858 elif cls == "english": 

2859 # If the text contains any of certain grammatical words, treat it 

2860 # as a "note" instead of "english" 

2861 if re.search(tr_note_re, par): 2861 ↛ 2862line 2861 didn't jump to line 2862 because the condition on line 2861 was never true

2862 if par.endswith(":"): 

2863 par = par[:-1] 

2864 if par not in ("see entry for forms",): 

2865 if note: 

2866 note = note + ";" + par 

2867 else: 

2868 note = par 

2869 else: 

2870 # There can be more than one parenthesized english item, see 

2871 # e.g. Aunt/English/Translations/Tamil 

2872 if tr.get("english"): 2872 ↛ 2873line 2872 didn't jump to line 2873 because the condition on line 2872 was never true

2873 tr["english"] += "; " + par 

2874 else: 

2875 tr["english"] = par 

2876 elif cls == "romanization": 2876 ↛ 2896line 2876 didn't jump to line 2896 because the condition on line 2876 was always true

2877 # print("roman text={!r} text cls={}" 

2878 # .format(text, classify_desc(text))) 

2879 if classify_desc(text) in ( 2879 ↛ 2883line 2879 didn't jump to line 2883 because the condition on line 2879 was never true

2880 "english", 

2881 "romanization", 

2882 ) and lang not in ("Egyptian",): 

2883 if beginning: 

2884 restore_beginning += "({}) ".format(par) 

2885 else: 

2886 restore_end = " ({})".format(par) + restore_end 

2887 else: 

2888 if tr.get("roman"): 2888 ↛ 2889line 2888 didn't jump to line 2889 because the condition on line 2888 was never true

2889 wxr.wtp.debug( 

2890 'more than one value in "roman": {} vs. {}'.format( 

2891 tr["roman"], par 

2892 ), 

2893 sortid="form_descriptions/2013", 

2894 ) 

2895 tr["roman"] = par 

2896 elif cls == "taxonomic": 

2897 if tr.get("taxonomic"): 

2898 wxr.wtp.debug( 

2899 'more than one value in "taxonomic": {} vs. {}'.format( 

2900 tr["taxonomic"], par 

2901 ), 

2902 sortid="form_descriptions/2019", 

2903 ) 

2904 if re.match(r"×[A-Z]", par): 

2905 data_append(tr, "tags", "extinct") 

2906 par = par[1:] 

2907 tr["taxonomic"] = par 

2908 elif cls == "other": 

2909 if tr.get("alt"): 

2910 wxr.wtp.debug( 

2911 'more than one value in "alt": {} vs. {}'.format( 

2912 tr["alt"], par 

2913 ), 

2914 sortid="form_descriptions/2028", 

2915 ) 

2916 tr["alt"] = par 

2917 else: 

2918 wxr.wtp.debug( 

2919 "parse_translation_desc unimplemented cls {}: {}".format( 

2920 cls, par 

2921 ), 

2922 sortid="form_descriptions/2033", 

2923 ) 

2924 

2925 # Check for gender indications in suffix 

2926 text, final_tags = parse_head_final_tags(wxr, lang, text) 

2927 data_extend(tr, "tags", final_tags) 

2928 

2929 # Restore those parts that we did not want to remove (they are often 

2930 # optional words or words that are always used with the given translation) 

2931 text = restore_beginning + text + restore_end 

2932 

2933 if note: 2933 ↛ 2934line 2933 didn't jump to line 2934 because the condition on line 2933 was never true

2934 tr["note"] = note.strip() 

2935 if text and text not in ignored_translations: 2935 ↛ 2940line 2935 didn't jump to line 2940 because the condition on line 2935 was always true

2936 tr["word"] = text.strip() 

2937 

2938 # Sometimes gender seems to be at the end of "roman" field, see e.g. 

2939 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction") 

2940 roman = tr.get("roman") 

2941 if roman: 

2942 if roman.endswith(" f"): 2942 ↛ 2943line 2942 didn't jump to line 2943 because the condition on line 2942 was never true

2943 data_append(tr, "tags", "feminine") 

2944 tr["roman"] = roman[:-2].strip() 

2945 elif roman.endswith(" m"): 2945 ↛ 2946line 2945 didn't jump to line 2946 because the condition on line 2945 was never true

2946 data_append(tr, "tags", "masculine") 

2947 tr["roman"] = roman[:-2].strip() 

2948 

2949 # If the word now has "english" field but no "roman" field, and 

2950 # the word would be classified "other" (generally non-latin 

2951 # characters), and the value in "english" is only one lowercase 

2952 # word, move it to "roman". This happens semi-frequently when the 

2953 # translation is transliterated the same as some English word. 

2954 roman = tr.get("roman") 

2955 english = tr.get("english") 

2956 if english and not roman and "word" in tr: 

2957 cls = classify_desc(tr["word"]) 

2958 if cls == "other" and " " not in english and english[0].islower(): 2958 ↛ 2965line 2958 didn't jump to line 2965 because the condition on line 2958 was always true

2959 del tr["english"] 

2960 tr["roman"] = english 

2961 

2962 # If the entry now has both tr["roman"] and tr["word"] and they have 

2963 # the same value, delete tr["roman"] (e.g., man/English/Translations 

2964 # Evenki) 

2965 if tr.get("word") and tr.get("roman") == tr.get("word"): 2965 ↛ 2966line 2965 didn't jump to line 2966 because the condition on line 2965 was never true

2966 del tr["roman"] 

2967 

2968 
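The while-loop in parse_translation_desc above peels parenthesized material off the translation text one group at a time, preferring the end of the string, then the start, then the middle. The snippet below demonstrates only the end-of-string case on an invented translation item, reusing the same regular expression.

import re

text = "жена (žena) (f)"  # invented item: word, romanization, gender marker
found = []
while True:
    m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)
    if not m:
        break
    found.append(m.group(1))
    text = text[: m.start()]

print(text, found)  # жена ['f', 'žena']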

2969def parse_alt_or_inflection_of( 

2970 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str] 

2971) -> Optional[tuple[list[str], Optional[list[AltOf]]]]: 

2972 """Tries to parse an inflection-of or alt-of description. If successful, 

2973 this returns (tags, alt-of/inflection-of-dict). If the description cannot 

2974 be parsed, this returns None. This may also return (tags, None) when the 

2975 gloss describes a form (or some other tags were extracted from it), but 

2976 there was no alt-of/form-of/synonym-of word.""" 

2977 # print("parse_alt_or_inflection_of: {!r}".format(gloss)) 

2978 # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning. 

2979 

2980 # Never interpret a gloss that is equal to the word itself as a tag 

2981 # (e.g., instrumental/Romanian, instrumental/Spanish). 

2982 if gloss.lower() == wxr.wtp.title.lower() or ( # type:ignore[union-attr] 2982 ↛ 2985line 2982 didn't jump to line 2985 because the condition on line 2982 was never true

2983 len(gloss) >= 5 and distw([gloss.lower()], wxr.wtp.title.lower()) < 0.2 # type:ignore[union-attr] 

2984 ): 

2985 return None 

2986 

2987 # First try parsing it as-is 

2988 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args) 

2989 if parsed is not None: 

2990 return parsed 

2991 

2992 # Next try parsing it with the first character converted to lowercase if 

2993 # it was previously uppercase. 

2994 if gloss and gloss[0].isupper(): 

2995 gloss = gloss[0].lower() + gloss[1:] 

2996 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args) 

2997 if parsed is not None: 2997 ↛ 2998line 2997 didn't jump to line 2998 because the condition on line 2997 was never true

2998 return parsed 

2999 

3000 return None 

3001 

3002 
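parse_alt_or_inflection_of above tries the gloss as written and, only if that fails, retries with the first character lower-cased, since glosses often carry sentence-initial capitalization. A schematic of that control flow, with a hypothetical parse_once() callback standing in for parse_alt_or_inflection_of1, is:

from typing import Callable, Optional

def try_both_cases(gloss: str,
                   parse_once: Callable[[str], Optional[tuple]]) -> Optional[tuple]:
    """Attempt the gloss verbatim, then retry with a lower-cased first letter."""
    parsed = parse_once(gloss)
    if parsed is not None:
        return parsed
    if gloss and gloss[0].isupper():
        return parse_once(gloss[0].lower() + gloss[1:])
    return None

# Toy parser that only recognizes lowercase "plural of ..." glosses.
demo = lambda g: (["plural"], g[len("plural of "):]) if g.startswith("plural of ") else None
print(try_both_cases("Plural of dog", demo))  # (['plural'], 'dog')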

3003# These tags are not allowed in alt-or-inflection-of parsing 

3004alt_infl_disallowed: set[str] = set( 

3005 [ 

3006 "error-unknown-tag", 

3007 "place", # Not in inflected forms and causes problems e.g. house/English 

3008 ] 

3009) 

3010 

3011 

3012def parse_alt_or_inflection_of1( 

3013 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str] 

3014) -> Optional[tuple[list[str], Optional[list[AltOf]]]]: 

3015 """Helper function for parse_alt_or_inflection_of. This handles a single 

3016 capitalization.""" 

3017 if not gloss or not gloss.strip(): 3017 ↛ 3018line 3017 didn't jump to line 3018 because the condition on line 3017 was never true

3018 return None 

3019 

3020 # Prevent some common errors where we would parse something we shouldn't 

3021 if re.search(r"(?i)form of address ", gloss): 3021 ↛ 3022line 3021 didn't jump to line 3022 because the condition on line 3021 was never true

3022 return None 

3023 

3024 gloss = re.sub(r"only used in [^,]+, ", "", gloss) 

3025 

3026 # First try all formats ending with "of" (or other known last words that 

3027 # can end a form description) 

3028 matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss)) 

3029 m: Optional[re.Match] 

3030 for m in reversed(matches): 

3031 desc = gloss[: m.end()].strip() 

3032 base = gloss[m.end() :].strip() 

3033 tagsets, topics = decode_tags(desc, no_unknown_starts=True) 

3034 if not topics and any( 

3035 not (alt_infl_disallowed & set(ts)) for ts in tagsets 

3036 ): 

3037 # Successfully parsed, including "of" etc. 

3038 tags: list[str] = [] 

3039 # If you have ("Western-Armenian", ..., "form-of") as your 

3040 # tag set, it's most probable that it's something like 

3041 # "Western Armenian form of խոսել (xosel)", which should 

3042 # get "alt-of" instead of "form-of" (inflection). 

3043 # խօսիլ/Armenian 

3044 for ts_t in tagsets: 

3045 if "form-of" in ts_t and any( 

3046 valid_tags.get(tk) == "dialect" for tk in ts_t 

3047 ): 

3048 ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"} 

3049 else: 

3050 ts_s = set(ts_t) 

3051 if not (alt_infl_disallowed & ts_s): 3051 ↛ 3044line 3051 didn't jump to line 3044 because the condition on line 3051 was always true

3052 tags.extend(ts_s) 

3053 if ( 

3054 "alt-of" in tags 

3055 or "form-of" in tags 

3056 or "synonym-of" in tags 

3057 or "compound-of" in tags 

3058 ): 

3059 break 

3060 if m.group(1) == "of": 

3061 # Try parsing without the final "of". This is commonly used in 

3062 # various form-of expressions. 

3063 desc = gloss[: m.start()] 

3064 base = gloss[m.end() :] 

3065 tagsets, topics = decode_tags(desc, no_unknown_starts=True) 

3066 # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}" 

3067 # .format(desc, base, tagsets, topics)) 

3068 if not topics and any( 

3069 not (alt_infl_disallowed & set(t)) for t in tagsets 

3070 ): 

3071 tags = [] 

3072 for t in tagsets: 

3073 if not (alt_infl_disallowed & set(t)): 3073 ↛ 3072line 3073 didn't jump to line 3072 because the condition on line 3073 was always true

3074 tags.extend(t) 

3075 # It must have at least one tag from form_of_tags 

3076 if set(tags) & form_of_tags: 3076 ↛ 3080line 3076 didn't jump to line 3080 because the condition on line 3076 was always true

3077 # Accept this as form-of 

3078 tags.append("form-of") 

3079 break 

3080 if set(tags) & alt_of_tags: 

3081 # Accept this as alt-of 

3082 tags.append("alt-of") 

3083 break 

3084 

3085 else: 

3086 # Did not find a form description based on last word; see if the 

3087 # whole description is tags 

3088 tagsets, topics = decode_tags(gloss, no_unknown_starts=True) 

3089 if not topics and any( 

3090 not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts) 

3091 for ts in tagsets 

3092 ): 

3093 tags = [] 

3094 for ts in tagsets: 

3095 if not (alt_infl_disallowed & set(ts)) and form_of_tags & set( 3095 ↛ 3094line 3095 didn't jump to line 3094 because the condition on line 3095 was always true

3096 ts 

3097 ): 

3098 tags.extend(ts) 

3099 base = "" 

3100 else: 

3101 return None 

3102 

3103 # kludge for Spanish (again): 'x of [word] combined with [clitic]' 

3104 m = re.search(r"combined with \w+$", base) 

3105 if m: 3105 ↛ 3106line 3105 didn't jump to line 3106 because the condition on line 3105 was never true

3106 tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True) 

3107 if not topics: 

3108 for ts in tagsets: 

3109 tags.extend(ts) 

3110 base = base[: m.start()] 

3111 

3112 # It is fairly common for form_of glosses to end with something like 

3113 # "ablative case" or "in instructive case". Parse that ending. 

3114 base = base.strip() 

3115 lst = base.split() 

3116 # print("parse_alt_or_inflection_of: lst={}".format(lst)) 

3117 if len(lst) >= 3 and lst[-1] in ("case", "case."): 3117 ↛ 3118line 3117 didn't jump to line 3118 because the condition on line 3117 was never true

3118 node = valid_sequences.children.get(lst[-2]) 

3119 if node and node.end: 

3120 for s in node.tags: 

3121 tags.extend(s.split(" ")) 

3122 lst = lst[:-2] 

3123 if lst[-1] == "in" and len(lst) > 1: 

3124 lst = lst[:-1] 

3125 

3126 # Eliminate empty and duplicate tags 

3127 tags = list(sorted(set(t for t in tags if t))) 

3128 

3129 # Clean up some extra stuff from the linked word, separating the text 

3130 # into ``base`` (the linked word) and ``extra`` (additional information, 

3131 # such as English translation or clarifying word sense information). 

3132 orig_base = base 

3133 base = re.sub(alt_of_form_of_clean_re, "", orig_base) 

3134 base = re.sub(r" [(⟨][^()]*[)⟩]", "", base) # Remove all (...) groups 

3135 extra = orig_base[len(base) :] 

3136 extra = re.sub(r"^[- :;.,,—]+", "", extra) 

3137 if extra.endswith(".") and extra.count(".") == 1: 3137 ↛ 3138line 3137 didn't jump to line 3138 because the condition on line 3137 was never true

3138 extra = extra[:-1].strip() 

3139 m = re.match(r"^\(([^()]*)\)$", extra) 

3140 if m: 3140 ↛ 3141line 3140 didn't jump to line 3141 because the condition on line 3140 was never true

3141 extra = m.group(1) 

3142 else: 

3143 # These weird brackets are used in "slash mark" 

3144 m = re.match(r"^⟨([^()]*)⟩$", extra) 

3145 if m: 3145 ↛ 3146line 3145 didn't jump to line 3146 because the condition on line 3145 was never true

3146 extra = m.group(1) 

3147 m = re.match(r'^[“"]([^"“”]*)["”]$', extra) 

3148 if m: 3148 ↛ 3149line 3148 didn't jump to line 3149 because the condition on line 3148 was never true

3149 extra = m.group(1) 

3150 # Note: base might still contain comma-separated values and values 

3151 # separated by "and" 

3152 base = base.strip() 

3153 if base.endswith(",") and len(base) > 2: 3153 ↛ 3154line 3153 didn't jump to line 3154 because the condition on line 3153 was never true

3154 base = base[:-1].strip() 

3155 while ( 

3156 base.endswith(".") 

3157 and not wxr.wtp.page_exists(base) 

3158 and base not in gloss_template_args 

3159 ): 

3160 base = base[:-1].strip() 

3161 if base.endswith('(\u201cconjecture")'): 3161 ↛ 3162line 3161 didn't jump to line 3162 because the condition on line 3161 was never true

3162 base = base[:-14].strip() 

3163 tags.append("conjecture") 

3164 while ( 3164 ↛ 3169line 3164 didn't jump to line 3169 because the condition on line 3164 was never true

3165 base.endswith(".") 

3166 and not wxr.wtp.page_exists(base) 

3167 and base not in gloss_template_args 

3168 ): 

3169 base = base[:-1].strip() 

3170 if ( 3170 ↛ 3175line 3170 didn't jump to line 3175 because the condition on line 3170 was never true

3171 base.endswith(".") 

3172 and base not in gloss_template_args 

3173 and base[:-1] in gloss_template_args 

3174 ): 

3175 base = base[:-1] 

3176 base = base.strip() 

3177 if not base: 

3178 return tags, None 

3179 

3180 # Kludge: Spanish verb forms seem to have a dot added at the end. 

3181 # Remove it; we know of no Spanish verbs ending with a dot. 

3182 language = wxr.wtp.section 

3183 pos = wxr.wtp.subsection 

3184 # print("language={} pos={} base={}".format(language, pos, base)) 

3185 if ( 3185 ↛ 3191 line 3185 didn't jump to line 3191 because the condition on line 3185 was never true

3186 base.endswith(".") 

3187 and len(base) > 1 

3188 and base[-2].isalpha() 

3189 and (language == "Spanish" and pos == "Verb") 

3190 ): 

3191 base = base[:-1] 

3192 
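# Hypothetical illustration of the kludge above: with wxr.wtp.section ==
# "Spanish" and wxr.wtp.subsection == "Verb", a base such as "habla."
# would lose its trailing dot and become "habla".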

3193 # Split base into alternatives when multiple alternatives are provided 

3194 parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "]) 

3195 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "") 

3196 if ( 3196 ↛ 3205 line 3196 didn't jump to line 3205 because the condition on line 3196 was always true

3197 len(parts) <= 1 

3198 or base.startswith("/") 

3199 or base.endswith("/") 

3200 or "/" in titleword 

3201 ): 

3202 parts = [base] 
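# Sketch of the splitting above (hypothetical inputs): a base like
# "foo, bar" would normally yield parts = ["foo", "bar"], but if the page
# title contains "/" (or base starts or ends with "/"), the split is
# undone and parts collapses back to [base].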

3203 # Split base into alternatives when it has the form "a or b" and "a" and "b" 

3204 # are similar (generally spelling variants of the same word or similar words) 

3205 if len(parts) == 1: 3205 ↛ 3211 line 3205 didn't jump to line 3211 because the condition on line 3205 was always true

3206 pp = base.split() 

3207 if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4: 

3208 parts = [pp[0], pp[2]] 

3209 
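# Hypothetical "a or b" case (assuming distw() scores the two spellings
# as similar, i.e. below 0.4): base = "color or colour" would split into
# parts = ["color", "colour"], so each variant gets its own entry below.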

3210 # Create form-of/alt-of entries based on the extracted data 

3211 dt_lst: list[AltOf] = [] 

3212 for p in parts: 

3213 # Check for some suspicious base forms 

3214 m = re.search(r"[.,] |[{}()]", p) 

3215 if m and not wxr.wtp.page_exists(p): 3215 ↛ 3216 line 3215 didn't jump to line 3216 because the condition on line 3215 was never true

3216 wxr.wtp.debug( 

3217 "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p), 

3218 sortid="form_descriptions/2278", 

3219 ) 

3220 if p.startswith("*") and len(p) >= 3 and p[1].isalpha(): 3220 ↛ 3221 line 3220 didn't jump to line 3221 because the condition on line 3220 was never true

3221 p = p[1:] 

3222 dt: AltOf = {"word": p} 

3223 if extra: 

3224 dt["extra"] = extra 

3225 dt_lst.append(dt) 

3226 # print("alt_or_infl_of returning tags={} lst={} base={!r}" 

3227 # .format(tags, lst, base)) 

3228 return tags, dt_lst 

3229 
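# Shape of the return value, with hypothetical data: a gloss such as
# "feminine plural of gata" could come back roughly as
#   (["feminine", "plural"], [{"word": "gata"}])
# i.e. a sorted, de-duplicated tag list plus one AltOf dict per part of
# ``base``, with an "extra" key added when trailing clarification text
# was found.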

3230 

3231@functools.lru_cache(maxsize=65536) 

3232def classify_desc( 

3233 desc: str, 

3234 allow_unknown_tags=False, 

3235 no_unknown_starts=False, 

3236 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(), 

3237) -> str: 

3238 """Determines whether the given description is most likely tags, english, 

3239 a romanization, or something else. Returns one of: "tags", "english", 

3240 "romanization", or "other". If ``allow_unknown_tags`` is True, then 

3241 allow "tags" classification even when the only tags are those starting 

3242 with a word in allowed_unknown_starts.""" 

3243 assert isinstance(desc, str) 

3244 # Empty and whitespace-only strings are treated as "other" 

3245 desc = desc.strip() 

3246 if not desc: 

3247 return "other" 

3248 

3249 normalized_desc = unicodedata.normalize("NFKD", desc) 

3250 

3251 # If it can be fully decoded as tags without errors, treat as tags 

3252 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts) 

3253 for tagset in tagsets: 

3254 assert isinstance(tagset, (list, tuple, set)) 

3255 if "error-unknown-tag" not in tagset and ( 

3256 topics or allow_unknown_tags or any(" " not in x for x in tagset) 

3257 ): 

3258 return "tags" 

3259 

3260 # Check if it looks like the taxonomic name of a species 

3261 if desc in known_species: 

3262 return "taxonomic" 

3263 desc1 = re.sub(r"^×([A-Z])", r"\1", desc) 

3264 desc1 = re.sub(r"\s*×.*", "", desc1) 

3265 lst = desc1.split() 

3266 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts: 

3267 have_non_english = 1 if lst[0].lower() not in english_words else 0 

3268 for x in lst[1:]: 

3269 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"): 

3270 continue 

3271 if x[0].isupper(): 

3272 break 

3273 if x not in english_words: 

3274 have_non_english += 1 

3275 else: 

3276 # Starts with known taxonomic term, does not contain uppercase 

3277 # words (except allowed letters) and at least one word is not 

3278 # English 

3279 if have_non_english >= len(lst) - 1 and have_non_english > 0: 3279 ↛ 3285 line 3279 didn't jump to line 3285 because the condition on line 3279 was always true

3280 return "taxonomic" 

3281 
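# Hypothetical example of the species heuristic above (assuming "Quercus"
# is in known_firsts and neither word is in english_words): for
# desc = "Quercus robur", both words count as non-English, no later word
# is capitalized, and the for-else branch returns "taxonomic".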

3282 # If all words are in our English dictionary, interpret as English. 

3283 # [ -~] is a regex character range meaning "all characters from space 

3284 # to tilde", i.e. every printable ASCII character. 

3285 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1: 

3286 if desc in english_words and desc[0].isalpha(): 

3287 return "english" # Handles ones containing whitespace 

3288 desc1 = re.sub( 

3289 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc 

3290 ) 

3291 tokens = tokenizer.tokenize(desc1) 

3292 if not tokens: 3292 ↛ 3293 line 3292 didn't jump to line 3293 because the condition on line 3292 was never true

3293 return "other" 

3294 lst_bool = list( 

3295 x not in not_english_words 

3296 and 

3297 # not x.isdigit() and 

3298 ( 

3299 x in english_words 

3300 or x.lower() in english_words 

3301 or x in known_firsts 

3302 or x[0].isdigit() 

3303 or x in accepted 

3304 or 

3305 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or 

3306 ( 

3307 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words 

3308 ) # Plural 

3309 or ( 

3310 x.endswith("ies") 

3311 and len(x) >= 5 

3312 and x[:-3] + "y" in english_words 

3313 ) # E.g. lily - lilies 

3314 or ( 

3315 x.endswith("ing") 

3316 and len(x) >= 5 

3317 and x[:-3] in english_words 

3318 ) # E.g. bring - bringing 

3319 or ( 

3320 x.endswith("ing") 

3321 and len(x) >= 5 

3322 and x[:-3] + "e" in english_words 

3323 ) # E.g., tone - toning 

3324 or ( 

3325 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words 

3326 ) # E.g. hang - hanged 

3327 or ( 

3328 x.endswith("ed") 

3329 and len(x) >= 5 

3330 and x[:-2] + "e" in english_words 

3331 ) # E.g. atone - atoned 

3332 or (x.endswith("'s") and x[:-2] in english_words) 

3333 or (x.endswith("s'") and x[:-2] in english_words) 

3334 or ( 

3335 x.endswith("ise") 

3336 and len(x) >= 5 

3337 and x[:-3] + "ize" in english_words 

3338 ) 

3339 or ( 

3340 x.endswith("ised") 

3341 and len(x) >= 6 

3342 and x[:-4] + "ized" in english_words 

3343 ) 

3344 or ( 

3345 x.endswith("ising") 

3346 and len(x) >= 7 

3347 and x[:-5] + "izing" in english_words 

3348 ) 

3349 or ( 

3350 re.search(r"[-/]", x) 

3351 and all( 

3352 ((y in english_words and len(y) > 2) or not y) 

3353 for y in re.split(r"[-/]", x) 

3354 ) 

3355 ) 

3356 ) 

3357 for x in tokens 

3358 ) 
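# Hedged reading of the suffix rules in lst_bool above (assuming the base
# forms are in english_words): "lilies" would be accepted via the
# "ies" -> "y" rule, "toning" via "ing" -> +"e", "atoned" via "ed" -> +"e",
# and "dog's" via the possessive "'s" rule.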

3359 cnt = lst_bool.count(True) 

3360 rejected_words = tuple( 

3361 x for i, x in enumerate(tokens) if not lst_bool[i] 

3362 ) 

3363 if ( 

3364 any( 

3365 lst_bool[i] and x[0].isalpha() and len(x) > 1 

3366 for i, x in enumerate(tokens) 

3367 ) 

3368 and not desc.startswith("-") 

3369 and not desc.endswith("-") 

3370 and re.search(r"\w+", desc) 

3371 and ( 

3372 cnt == len(lst_bool) 

3373 or ( 

3374 any( 

3375 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens) 

3376 ) 

3377 and cnt >= len(lst_bool) - 1 

3378 ) 

3379 or cnt / len(lst_bool) >= 0.8 

3380 or ( 

3381 all(x in potentially_english_words for x in rejected_words) 

3382 and cnt / len(lst_bool) >= 0.50 

3383 ) 

3384 ) 

3385 ): 

3386 return "english" 

3387 # Some translations have apparent pronunciation descriptions in /.../ 

3388 # which we'll put in the romanization field (even though they probably are 

3389 # not exactly romanizations). 

3390 if desc.startswith("/") and desc.endswith("/"): 

3391 return "romanization" 

3392 # If all characters are in classes that could occur in romanizations, 

3393 # treat as romanization 

3394 classes = list( 

3395 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK" 

3396 for x in normalized_desc 

3397 ) 

3398 classes1 = [] 

3399 num_latin = 0 

3400 num_greek = 0 

3401 # part = "" 

3402 # for ch, cl in zip(normalized_desc, classes): 

3403 # part += f"{ch}({cl})" 

3404 # print(part) 

3405 for ch, cl in zip(normalized_desc, classes): 

3406 if ch in ( 

3407 "'", # ' in Arabic, / in IPA-like parenthesized forms 

3408 ".", # e.g., "..." in translations 

3409 ";", 

3410 ":", 

3411 "!", 

3412 "‘", 

3413 "’", 

3414 '"', 

3415 "“", 

3416 "”", 

3417 "/", 

3418 "?", 

3419 "…", # alternative to "..." 

3420 "⁉", # 見る/Japanese automatic transcriptions... 

3421 "?", 

3422 "!", 

3423 "⁻", # superscript -, used in some Cantonese roman, e.g. "we" 

3424 "ʔ", 

3425 "ʼ", 

3426 "ʾ", 

3427 "ʹ", 

3428 ): # ʹ e.g. in understand/English/verb Russian transl 

3429 classes1.append("OK") 

3430 continue 

3431 if cl not in ("Ll", "Lu"): 

3432 classes1.append(cl) 

3433 continue 

3434 try: 

3435 name = unicodedata.name(ch) 

3436 first = name.split()[0] 

3437 if first == "LATIN": 

3438 num_latin += 1 

3439 elif first == "GREEK": 

3440 num_greek += 1 

3441 elif first == "COMBINING": # Combining diacritic 3441 ↛ 3442 line 3441 didn't jump to line 3442 because the condition on line 3441 was never true

3442 cl = "OK" 

3443 elif re.match(non_latin_scripts_re, name): 3443 ↛ 3447 line 3443 didn't jump to line 3447 because the condition on line 3443 was always true

3444 cl = "NO" # Not acceptable in romanizations 

3445 except ValueError: 

3446 cl = "NO" # Not acceptable in romanizations 

3447 classes1.append(cl) 

3448 # print("classify_desc: {!r} classes1: {}".format(desc, classes1)) 

3449 # print(set(classes1) ) 

3450 if all( 

3451 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK") 

3452 for x in classes1 

3453 ): 

3454 if ( 

3455 (num_latin >= num_greek + 2 or num_greek == 0) 

3456 and classes1.count("OK") < len(classes1) 

3457 and classes1.count("Nd") < len(classes1) 

3458 ): 

3459 return "romanization" 

3461 # Otherwise it is something else, such as a hanji version of the word 

3461 return "other" 

3462 
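# Hedged usage sketch; actual results depend on the tag tables and the
# english_words / known_species data loaded at runtime:
#   classify_desc("nominative plural")       # likely "tags"
#   classify_desc("a small house")           # likely "english"
#   classify_desc("Ailuropoda melanoleuca")  # "taxonomic" if in known_species
#   classify_desc("/tʃɪr/")                  # "romanization" (starts and ends with "/")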

3463 

3464def remove_text_in_parentheses(text: str) -> str: 

3465 parentheses = 0 

3466 new_text = "" 

3467 for c in text: 

3468 if c == "(": 

3469 parentheses += 1 

3470 elif c == ")": 

3471 parentheses -= 1 

3472 elif parentheses == 0: 

3473 new_text += c 

3474 return new_text
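# Minimal usage sketch (hypothetical input): parenthesized spans,
# including nested ones, are dropped while everything else is kept:
#   remove_text_in_parentheses("word (obsolete (rare)) form")  # -> "word  form"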